parent
be0467274d
commit
9971bb0d99
@ -0,0 +1,215 @@ |
|||||||
|
<?php |
||||||
|
/* For licensing terms, see /license.txt */ |
||||||
|
/** |
||||||
|
* This script finds duplicated documents to save disk space. |
||||||
|
* |
||||||
|
* It identifies duplicate documents by building an array of MD5 hashes |
||||||
|
* from the app/courses/[CODE]/document/ folders on your system |
||||||
|
* and checking for similar hashes everywhere in the other documents folders. |
||||||
|
* |
||||||
|
* This script should be located inside the tests/scripts/ folder to work |
||||||
|
* on the command line. To run it in a browser, move it to main/inc/ and |
||||||
|
* load it from there. |
||||||
|
* It can be run more than one time as it will only ever affect duplicate |
||||||
|
* documents. |
||||||
|
* |
||||||
|
* @author Yannick Warnier <yannick.warnier@beeznest.com> |
||||||
|
*/ |
||||||
|
//exit; //remove this line to execute from the command line |
||||||
|
|
||||||
|
use ChamiloSession as Session; |
||||||
|
|
||||||
|
// Set the PHP memory limit to 512M for the duration of the scan
ini_set('memory_limit', '512M');

require_once __DIR__.'/../../main/inc/global.inc.php';

// Any SAPI other than the command line is treated as a browser request:
// output then uses HTML line breaks and api_protect_admin_script() is called
// before anything is printed
$html = PHP_SAPI !== 'cli';
$htmlEOL = $html ? '<br />' : '';
if ($html) {
    api_protect_admin_script();
}

// Set to true to get more output
$debug = false;
// Set to false to also scan the folders and files treated as templates
$ignoreTemplates = true;
// Base URL of the documents tool; the course code is appended to it
$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq=';

if ($html) {
    echo '<html><body>'.$htmlEOL;
}
echo '['.time().'] Querying courses'.$htmlEOL."\n";

$sql = "SELECT id, code, directory FROM course order by id";
$resCourse = Database::query($sql);
if ($resCourse === false) {
    exit('Could not find any course'.PHP_EOL);
}

$countCourses = Database::num_rows($resCourse);
echo '['.time().'] Found '.$countCourses.' courses'.$htmlEOL.PHP_EOL;

// Hash => list of files sharing that hash, hash => file size, and
// course code => course directory
$md5Hashes = $md5Sizes = $courseCodeDirMatch = [];
// Counters used by the scan and by the final report
$totalDocs = $totalDocsSize = $uniqueDocs = $duplicateDocsCount = $totalDuplicateDocs = 0;
$sysCoursePath = api_get_path(SYS_COURSE_PATH);
||||||
|
|
||||||
|
// Go through every course returned by the query and scan its document/
// folder, collecting the MD5 hash and size of each file found there
if ($debug) {
    echo '['.time().'] Iterating on courses...'.$htmlEOL.PHP_EOL;
}
while ($course = Database::fetch_assoc($resCourse)) {
    if ($debug) {
        echo 'Course '.$course['id'].' ('.$course['code'].')..'.$htmlEOL.PHP_EOL;
    }
    // Remember which directory belongs to which course code
    $courseCodeDirMatch[$course['code']] = $course['directory'];
    $courseDir = $course['directory'].'/document';
    $baseWorkDir = $sysCoursePath.$courseDir;
    $foundInCourse = _scanSubDirs(
        $baseWorkDir,
        $md5Hashes,
        $md5Sizes,
        $course['code'],
        $ignoreTemplates
    );
    $totalDocs += $foundInCourse;
} // end while on course
||||||
|
|
||||||
|
// Sort the hashes by file size, largest first
arsort($md5Sizes, SORT_NUMERIC);

// Number of symbolic links met among the duplicates. It has to be
// initialised here because it is incremented inside the loop below.
$totalLinks = 0;

echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL;
echo " with a clickable link to each document:$htmlEOL".PHP_EOL;
foreach (array_keys($md5Sizes) as $hash) {
    $files = $md5Hashes[$hash];
    $countFiles = $realFilesForThisHash = count($files);
    if ($countFiles < 2) {
        // A single file with this hash is not a duplicate
        continue;
    }

    $kSize = intdiv((int) $files[0]['size'], 1024);
    // Use an empty string (not 0) as the closing tag in CLI mode, otherwise
    // a stray "0" is printed after every heading
    echo ($html ? '<b>' : '').$hash.", size: ".$kSize."KB ($countFiles copies): ".($html ? '</b>' : '').$htmlEOL.PHP_EOL;
    if ($html) {
        echo "<ul>";
    }

    // Index of the current real (non-link) file for this hash
    $i = 0;
    foreach ($files as $file) {
        if ($file['link']) {
            // This is a link. Discount from real files
            // and do not count as potential space savings
            $realFilesForThisHash--;
            $totalLinks++;
            continue;
        }
        if ($i !== 0) {
            // We don't count the first file as a duplicate
            // nor as potential space savings
            $totalDuplicateDocs++;
            $totalDocsSize += $file['size'];
        }
        if ($html) {
            // File names come from the file system and may contain HTML
            // special characters, so escape everything written to the page
            $url = htmlspecialchars(
                $webCode.urlencode($file['course']),
                ENT_QUOTES | ENT_SUBSTITUTE,
                'UTF-8'
            );
            $label = htmlspecialchars(
                $file['subpath'].'('.$file['course'].')',
                ENT_QUOTES | ENT_SUBSTITUTE,
                'UTF-8'
            );
            echo "<li><a target='_blank' href='".$url."'>".$label."</a></li>".PHP_EOL;
        } else {
            echo $webCode.$file['course'].", as ".$file['path'].$htmlEOL.PHP_EOL;
        }
        $i++;
    }
    if ($html) {
        echo "</ul>";
    }

    if ($realFilesForThisHash > 0) {
        // if at least one of those was really a file, count it as unique
        if ($debug) {
            // Report the path of the last file listed for this hash
            $lastFile = $files[$countFiles - 1];
            echo "$hash (".$lastFile['path'].") has $realFilesForThisHash duplicates".PHP_EOL;
        }
        $uniqueDocs++;
    }
}
$duplicateDocsCount = $totalDocs - $uniqueDocs;

$sizeInMB = intdiv((int) $totalDocsSize, 1024 * 1024);
echo "[".time()."] Found $totalDocs docs in total (including links).".$htmlEOL.PHP_EOL;
echo " Only $uniqueDocs were original files with duplicates.".$htmlEOL.PHP_EOL;
echo " $totalDuplicateDocs files remain, as duplicates versions of some of those $uniqueDocs.".$htmlEOL.PHP_EOL;
echo " Potential savings: ~$sizeInMB MB.".$htmlEOL.PHP_EOL;

// Only close the HTML document when one was opened (browser mode)
if ($html) {
    echo "</body></html>";
}
||||||
|
|
||||||
|
/**
 * Scans the given directory and its subdirectories, records the MD5 hash and
 * size of every non-empty file found, and returns the number of files
 * recorded.
 *
 * Entries whose name starts with a '.' are ignored, as are empty files and
 * files whose MD5 hash cannot be calculated. Skipped entries are reported
 * only when the global $debug flag is set.
 *
 * @param string $path            Directory to scan
 * @param array  &$md5Hashes      Filled with hash => list of entries, each with
 *                                the keys 'course', 'link', 'path', 'size' and 'subpath'
 * @param array  &$md5Sizes       Filled with hash => size in bytes of the last file
 *                                recorded for that hash
 * @param string $courseCode      Course code stored with every recorded file
 * @param bool   $ignoreTemplates Whether to skip the audio, flash, images and video
 *                                folders and the index.html file located directly
 *                                under a document/ folder
 *
 * @return int Number of files recorded under $path
 */
function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int
{
    global $debug;

    if (!is_dir($path)) {
        return 0;
    }
    if (substr(basename($path), 0, 1) === '.') {
        // If last path component starts with a '.', ignore
        return 0;
    }
    // The trailing (/|$) makes sure only the exact folder names are skipped,
    // not other folders that merely start with one of them (e.g. "audiobooks")
    if ($ignoreTemplates && preg_match('#/document/(audio|flash|images|video)(/|$)#', $path)) {
        if ($debug) {
            echo $path." is part of the template, skipping".PHP_EOL;
        }

        return 0;
    }

    $list = scandir($path);
    if ($list === false) {
        // The directory exists but could not be read
        if ($debug) {
            echo 'Skipping: Could not read directory '.$path.PHP_EOL;
        }

        return 0;
    }

    $count = 0;
    foreach ($list as $entry) {
        if (substr($entry, 0, 1) === '.') {
            // If the entry starts with a '.', ignore
            continue;
        }
        $subPath = $path.'/'.$entry;
        if ($ignoreTemplates && preg_match('#/document/index\.html$#', $subPath)) {
            continue;
        }
        if (is_dir($subPath)) {
            $count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates);
            continue;
        }

        $fileMd5 = md5_file($subPath);
        if ($fileMd5 === false) {
            if ($debug) {
                echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL;
            }
            continue;
        }

        $fileSize = filesize($subPath);
        if (!($fileSize > 0)) {
            if ($debug) {
                echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL;
            }
            continue;
        }

        // Path relative to the document/ folder; fall back to the file name
        // when the scanned path has no /document/ component
        $matches = [];
        $subpath = preg_match('#/document/(.*)#', $subPath, $matches) === 1
            ? $matches[1]
            : $entry;

        if (!isset($md5Hashes[$fileMd5])) {
            $md5Hashes[$fileMd5] = [];
        }
        $md5Hashes[$fileMd5][] = [
            'course' => $courseCode,
            'link' => is_link($subPath),
            'path' => $subPath,
            'size' => $fileSize,
            'subpath' => $subpath,
        ];
        $md5Sizes[$fileMd5] = $fileSize;
        $count++;
    }

    return $count;
}
||||||
Loading…
Reference in new issue