parent
							
								
									be0467274d
								
							
						
					
					
						commit
						9971bb0d99
					
				@ -0,0 +1,215 @@ | 
				
			|||||||
 | 
					<?php | 
				
			||||||
 | 
					/* For licensing terms, see /license.txt */ | 
				
			||||||
 | 
					/** | 
				
			||||||
 | 
					 * This script finds duplicated documents to save disk space. | 
				
			||||||
 | 
					 * | 
				
			||||||
 | 
					 * It identifies duplicate documents by building an array of MD5 hashes | 
				
			||||||
 | 
					 * from the app/courses/[CODE]/document/ folders on your system | 
				
			||||||
 | 
					 * and checking for similar hashes everywhere in the other documents folders. | 
				
			||||||
 | 
					 * | 
				
			||||||
 | 
					 * This script should be located inside the tests/scripts/ folder to work | 
				
			||||||
 | 
					 * on the command line. To run it in a browser, move it to main/inc/ and | 
				
			||||||
 | 
					 * load it from there. | 
				
			||||||
 | 
					 * It can be run more than one time as it will only ever affect duplicate | 
				
			||||||
 | 
					 * documents. | 
				
			||||||
 | 
					 * | 
				
			||||||
 | 
					 * @author Yannick Warnier <yannick.warnier@beeznest.com> | 
				
			||||||
 | 
					 */ | 
				
			||||||
 | 
					//exit; //remove this line to execute from the command line | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use ChamiloSession as Session; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ini_set('memory_limit', '512M'); | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					require_once __DIR__.'/../../main/inc/global.inc.php'; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$html = false; | 
				
			||||||
 | 
					$htmlEOL = ''; | 
				
			||||||
 | 
					if (PHP_SAPI !== 'cli') { | 
				
			||||||
 | 
					    $html = true; | 
				
			||||||
 | 
					    $htmlEOL = '<br />'; | 
				
			||||||
 | 
					    api_protect_admin_script(); | 
				
			||||||
 | 
					    //die('This script can only be executed from the command line'); | 
				
			||||||
 | 
					} | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Debug shows more output | 
				
			||||||
 | 
					$debug = false; | 
				
			||||||
 | 
					// Ignore template files | 
				
			||||||
 | 
					$ignoreTemplates = true; | 
				
			||||||
 | 
					$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq='; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if ($html) { | 
				
			||||||
 | 
					    echo "<html><body>".$htmlEOL; | 
				
			||||||
 | 
					} | 
				
			||||||
 | 
					echo "[".time()."] Querying courses$htmlEOL\n"; | 
				
			||||||
 | 
					$sql = "SELECT id, code, directory FROM course order by id"; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$resCourse = Database::query($sql); | 
				
			||||||
 | 
					if ($resCourse === false) { | 
				
			||||||
 | 
					    exit('Could not find any course'.PHP_EOL); | 
				
			||||||
 | 
					} | 
				
			||||||
 | 
					$countCourses = Database::num_rows($resCourse); | 
				
			||||||
 | 
					echo "[".time()."] Found $countCourses courses$htmlEOL".PHP_EOL; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$md5Hashes = []; | 
				
			||||||
 | 
					$md5Sizes = []; | 
				
			||||||
 | 
					$totalDocs = 0; | 
				
			||||||
 | 
					$totalDocsSize = 0; | 
				
			||||||
 | 
					$uniqueDocs = 0; | 
				
			||||||
 | 
					$duplicateDocsCount = 0; | 
				
			||||||
 | 
					$totalDuplicateDocs = 0; | 
				
			||||||
 | 
					$courseCodeDirMatch = []; | 
				
			||||||
 | 
					$sysCoursePath = api_get_path(SYS_COURSE_PATH); | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Search for duplicate tests, by looking for tests that have the exact same | 
				
			||||||
 | 
					// title in the same course | 
				
			||||||
 | 
					if ($debug) { | 
				
			||||||
 | 
					    echo "[".time()."] Iterating on courses...$htmlEOL".PHP_EOL; | 
				
			||||||
 | 
					} | 
				
			||||||
 | 
					while ($course = Database::fetch_assoc($resCourse)) { | 
				
			||||||
 | 
					    if ($debug) { | 
				
			||||||
 | 
					        echo "Course ".$course['id'].' ('.$course['code'].')..'.$htmlEOL.PHP_EOL; | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    $courseCodeDirMatch[$course['code']] = $course['directory']; | 
				
			||||||
 | 
					    $courseDir = $course['directory'].'/document'; | 
				
			||||||
 | 
					    $baseWorkDir = $sysCoursePath.$courseDir; | 
				
			||||||
 | 
					    $totalDocs += _scanSubDirs($baseWorkDir, $md5Hashes, $md5Sizes, $course['code'], $ignoreTemplates); | 
				
			||||||
 | 
					} // end while on course | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Sort array by sizes | 
				
			||||||
 | 
					arsort($md5Sizes, SORT_NUMERIC); | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL; | 
				
			||||||
 | 
					echo "             with a clickable link to each document:$htmlEOL".PHP_EOL; | 
				
			||||||
 | 
					foreach ($md5Sizes as $hash => $size) { | 
				
			||||||
 | 
					    $files = $md5Hashes[$hash]; | 
				
			||||||
 | 
					    $countFiles = $realFilesForThisHash = count($files); | 
				
			||||||
 | 
					    $realFilePath = ''; | 
				
			||||||
 | 
					    $i = 0; | 
				
			||||||
 | 
					    if ($countFiles > 1) { | 
				
			||||||
 | 
					        $kSize = floor((int) $files[0]['size']/1024); | 
				
			||||||
 | 
					        echo ($html ? '<b>' : '').$hash.", size: ".$kSize."KB ($countFiles copies): ".($html ? '</b>' : 0).$htmlEOL.PHP_EOL; | 
				
			||||||
 | 
					        if ($html) { | 
				
			||||||
 | 
					            echo "<ul>"; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					    } else { | 
				
			||||||
 | 
					        continue; | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    foreach ($files as $file) { | 
				
			||||||
 | 
					        if ($file['link']) { | 
				
			||||||
 | 
					            // This is a link. Discount from real files | 
				
			||||||
 | 
					            // and do not count as potential space savings | 
				
			||||||
 | 
					            $realFilesForThisHash--; | 
				
			||||||
 | 
					            $totalLinks++; | 
				
			||||||
 | 
					        } else { | 
				
			||||||
 | 
					            if ($i != 0) { | 
				
			||||||
 | 
					                // We don't count the first file as a duplicate | 
				
			||||||
 | 
					                // nor as potential space savings | 
				
			||||||
 | 
					                $totalDuplicateDocs++; | 
				
			||||||
 | 
					                $totalDocsSize += $file['size']; | 
				
			||||||
 | 
					            } | 
				
			||||||
 | 
					            if ($html) { | 
				
			||||||
 | 
					                echo "<li><a target='_blank' href='".$webCode.$file['course']."'>".$file['subpath']."(".$file['course'].")</a></li>".PHP_EOL; | 
				
			||||||
 | 
					            } else { | 
				
			||||||
 | 
					                echo $webCode.$file['course'].", as ".$file['path'].$htmlEOL.PHP_EOL; | 
				
			||||||
 | 
					            } | 
				
			||||||
 | 
					            $i++; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    if ($html) { | 
				
			||||||
 | 
					        echo "</ul>"; | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    if ($realFilesForThisHash > 0) { | 
				
			||||||
 | 
					        // if at least one of those was really a file, count it as unique | 
				
			||||||
 | 
					        if ($debug) { | 
				
			||||||
 | 
					            echo "$hash (".$file['path'].") has $realFilesForThisHash duplicates".PHP_EOL; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					        $uniqueDocs++; | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					} | 
				
			||||||
 | 
					$duplicateDocsCount = $totalDocs - $uniqueDocs; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$sizeInMB = floor((int) $totalDocsSize / (1024*1024)); | 
				
			||||||
 | 
					echo "[".time()."] Found $totalDocs docs in total (including links).".PHP_EOL; | 
				
			||||||
 | 
					echo "             Only $uniqueDocs were original files with duplicates.".PHP_EOL; | 
				
			||||||
 | 
					echo "             $totalDuplicateDocs files remain, as duplicates versions of some of those $uniqueDocs.".PHP_EOL; | 
				
			||||||
 | 
					echo "             Potential savings: ~$sizeInMB MB.".PHP_EOL; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					echo "</body></html>"; | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** | 
				
			||||||
 | 
					 * Scans the given directory and its subdirectories and returns a number of | 
				
			||||||
 | 
					 * files found | 
				
			||||||
 | 
					 * @param string $path | 
				
			||||||
 | 
					 * @param array &$md5Hashes | 
				
			||||||
 | 
					 * @param string $courseCode | 
				
			||||||
 | 
					 * @param bool $ignoreTemplates | 
				
			||||||
 | 
					 * @return int | 
				
			||||||
 | 
					 */ | 
				
			||||||
 | 
					function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int | 
				
			||||||
 | 
					{ | 
				
			||||||
 | 
					    global $debug; | 
				
			||||||
 | 
					    $count = 0; | 
				
			||||||
 | 
					    if (!is_dir($path)) { | 
				
			||||||
 | 
					        return 0; | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    if (substr(basename($path), 0, 1) === '.') { | 
				
			||||||
 | 
					        // If last path component starts with a '.', ignore | 
				
			||||||
 | 
					        return 0; | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    if ($ignoreTemplates) { | 
				
			||||||
 | 
					        if (preg_match('#/document/(audio|flash|images|video)#', $path)) { | 
				
			||||||
 | 
					            if ($debug) { | 
				
			||||||
 | 
					                echo $path." is part of the template, skipping".PHP_EOL; | 
				
			||||||
 | 
					            } | 
				
			||||||
 | 
					            return 0; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					    $list = scandir($path); | 
				
			||||||
 | 
					    foreach ($list as $entry) { | 
				
			||||||
 | 
					        if (substr($entry, 0, 1) === '.') { | 
				
			||||||
 | 
					            // If the entry starts with a '.', ignore | 
				
			||||||
 | 
					            continue; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					        $subPath = $path.'/'.$entry; | 
				
			||||||
 | 
					        if ($ignoreTemplates && preg_match('#/document/index.html$#', $subPath)) { | 
				
			||||||
 | 
					            continue; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					        if (is_dir($subPath)) { | 
				
			||||||
 | 
					            $count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates); | 
				
			||||||
 | 
					        } else { | 
				
			||||||
 | 
					            $fileMd5 = md5_file($subPath); | 
				
			||||||
 | 
					            if ($fileMd5 === false) { | 
				
			||||||
 | 
					                if ($debug) { | 
				
			||||||
 | 
					                    echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL; | 
				
			||||||
 | 
					                } | 
				
			||||||
 | 
					                continue; | 
				
			||||||
 | 
					            } | 
				
			||||||
 | 
					            $fileSize = filesize($subPath); | 
				
			||||||
 | 
					            if ($fileSize > 0) { | 
				
			||||||
 | 
					                if (!isset($md5Hashes[$fileMd5])) { | 
				
			||||||
 | 
					                    $md5Hashes[$fileMd5] = []; | 
				
			||||||
 | 
					                } | 
				
			||||||
 | 
					                $matches = []; | 
				
			||||||
 | 
					                preg_match('#/document/(.*)#', $subPath, $matches); | 
				
			||||||
 | 
					                $isLink = is_link($subPath); | 
				
			||||||
 | 
					                $md5Hashes[$fileMd5][] = [ | 
				
			||||||
 | 
					                    'course' => $courseCode, | 
				
			||||||
 | 
					                    'link' => $isLink, | 
				
			||||||
 | 
					                    'path' => $subPath, | 
				
			||||||
 | 
					                    'size' => $fileSize, | 
				
			||||||
 | 
					                    'subpath' => $matches[1], | 
				
			||||||
 | 
					                ]; | 
				
			||||||
 | 
					                $md5Sizes[$fileMd5] = $fileSize; | 
				
			||||||
 | 
					            } else { | 
				
			||||||
 | 
					                if ($debug) { | 
				
			||||||
 | 
					                    echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL; | 
				
			||||||
 | 
					                } | 
				
			||||||
 | 
					                continue; | 
				
			||||||
 | 
					            } | 
				
			||||||
 | 
					            $count++; | 
				
			||||||
 | 
					        } | 
				
			||||||
 | 
					    } | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return $count; | 
				
			||||||
 | 
					} | 
				
			||||||
					Loading…
					
					
				
		Reference in new issue