diff --git a/tests/scripts/find_duplicate_documents_by_md5.php b/tests/scripts/find_duplicate_documents_by_md5.php new file mode 100644 index 0000000000..0b2187ef80 --- /dev/null +++ b/tests/scripts/find_duplicate_documents_by_md5.php @@ -0,0 +1,215 @@ + + */ +//exit; //remove this line to execute from the command line + +use ChamiloSession as Session; + +ini_set('memory_limit', '512M'); + +require_once __DIR__.'/../../main/inc/global.inc.php'; + +$html = false; +$htmlEOL = ''; +if (PHP_SAPI !== 'cli') { + $html = true; + $htmlEOL = '
'; + api_protect_admin_script(); + //die('This script can only be executed from the command line'); +} + +// Debug shows more output +$debug = false; +// Ignore template files +$ignoreTemplates = true; +$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq='; + +if ($html) { + echo "".$htmlEOL; +} +echo "[".time()."] Querying courses$htmlEOL\n"; +$sql = "SELECT id, code, directory FROM course order by id"; + +$resCourse = Database::query($sql); +if ($resCourse === false) { + exit('Could not find any course'.PHP_EOL); +} +$countCourses = Database::num_rows($resCourse); +echo "[".time()."] Found $countCourses courses$htmlEOL".PHP_EOL; + +$md5Hashes = []; +$md5Sizes = []; +$totalDocs = 0; +$totalDocsSize = 0; +$uniqueDocs = 0; +$duplicateDocsCount = 0; +$totalDuplicateDocs = 0; +$courseCodeDirMatch = []; +$sysCoursePath = api_get_path(SYS_COURSE_PATH); + +// Scan the document directory of every course and index its files by MD5 +// hash, so that files sharing the same hash can be reported as duplicates +if ($debug) { + echo "[".time()."] Iterating on courses...$htmlEOL".PHP_EOL; +} +while ($course = Database::fetch_assoc($resCourse)) { + if ($debug) { + echo "Course ".$course['id'].' ('.$course['code'].')..'.$htmlEOL.PHP_EOL; + } + $courseCodeDirMatch[$course['code']] = $course['directory']; + $courseDir = $course['directory'].'/document'; + $baseWorkDir = $sysCoursePath.$courseDir; + $totalDocs += _scanSubDirs($baseWorkDir, $md5Hashes, $md5Sizes, $course['code'], $ignoreTemplates); +} // end while on course + +// Sort array by sizes +arsort($md5Sizes, SORT_NUMERIC); + +echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL; +echo " with a clickable link to each document:$htmlEOL".PHP_EOL; +foreach ($md5Sizes as $hash => $size) { + $files = $md5Hashes[$hash]; + $countFiles = $realFilesForThisHash = count($files); + $realFilePath = ''; + $i = 0; + if ($countFiles > 1) { + $kSize = floor((int) $files[0]['size']/1024); + echo ($html ? 
/**
 * Recursively scans a directory and indexes every non-empty file it contains
 * by the MD5 hash of its contents.
 *
 * Directories and entries whose name starts with a '.' are ignored. When
 * $ignoreTemplates is true, the audio, flash, images and video folders located
 * directly under a "document" directory, as well as a document/index.html
 * file, are skipped.
 *
 * @param string $path            Directory to scan, without trailing slash
 * @param array  &$md5Hashes      Filled by reference: MD5 hash => list of
 *                                ['course', 'link', 'path', 'size', 'subpath'] entries
 * @param array  &$md5Sizes       Filled by reference: MD5 hash => file size in bytes
 * @param string $courseCode      Course code stored with every indexed file
 * @param bool   $ignoreTemplates Whether to skip the template folders and index.html
 *
 * @return int Number of files indexed below $path (empty files and files whose
 *             MD5 could not be computed are not counted)
 */
function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int
{
    global $debug;

    if (!is_dir($path)) {
        return 0;
    }
    if (substr(basename($path), 0, 1) === '.') {
        // If last path component starts with a '.', ignore
        return 0;
    }
    // The folder name must match exactly: without the trailing (/|$) anchor,
    // user folders such as "images_2020" or "videos" would be skipped as well.
    if ($ignoreTemplates && preg_match('#/document/(audio|flash|images|video)(/|$)#', $path)) {
        if ($debug) {
            echo $path." \nis part of the template, skipping".PHP_EOL;
        }

        return 0;
    }

    $list = scandir($path);
    if ($list === false) {
        // scandir() returns false when the directory cannot be read; iterating
        // over false would only raise a second warning.
        if ($debug) {
            echo 'Skipping: Could not read directory '.$path.PHP_EOL;
        }

        return 0;
    }

    $count = 0;
    foreach ($list as $entry) {
        if (substr($entry, 0, 1) === '.') {
            // Also covers the '.' and '..' pseudo-entries
            continue;
        }
        $subPath = $path.'/'.$entry;
        if ($ignoreTemplates && preg_match('#/document/index\.html$#', $subPath)) {
            continue;
        }
        if (is_dir($subPath)) {
            $count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates);
            continue;
        }

        $fileMd5 = md5_file($subPath);
        if ($fileMd5 === false) {
            if ($debug) {
                echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL;
            }
            continue;
        }

        $fileSize = filesize($subPath);
        if (!($fileSize > 0)) {
            // Covers both empty files and a failed filesize() call (false)
            if ($debug) {
                echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL;
            }
            continue;
        }

        // Path relative to the first "/document/" component; empty when the
        // scanned path does not contain one (avoids an undefined index).
        $matches = [];
        preg_match('#/document/(.*)#', $subPath, $matches);

        if (!isset($md5Hashes[$fileMd5])) {
            $md5Hashes[$fileMd5] = [];
        }
        $md5Hashes[$fileMd5][] = [
            'course' => $courseCode,
            'link' => is_link($subPath),
            'path' => $subPath,
            'size' => $fileSize,
            'subpath' => $matches[1] ?? '',
        ];
        $md5Sizes[$fileMd5] = $fileSize;
        $count++;
    }

    return $count;
}