*/
exit; //remove this line to execute from the command line
use ChamiloSession as Session;

// Raise the memory limit for this run, then load the platform bootstrap
ini_set('memory_limit', '512M');
require_once __DIR__.'/../../main/inc/global.inc.php';

// Any SAPI other than the CLI gets the HTML output mode and is restricted
// to platform administrators
$html = (PHP_SAPI !== 'cli');
$htmlEOL = '';
if ($html) {
    $htmlEOL = '
';
    api_protect_admin_script();
    //die('This script can only be executed from the command line');
}

// Debug shows more output
$debug = false;
// Ignore template files
$ignoreTemplates = true;
// Base URL of the documents tool; the course code gets appended to it
$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq=';

if ($html) {
    echo "
", $htmlEOL;
}

echo '[', time(), '] Querying courses', $htmlEOL, "\n";
$sql = "SELECT id, code, directory FROM course order by id";
$resCourse = Database::query($sql);
if ($resCourse === false) {
    exit('Could not find any course'.PHP_EOL);
}
$countCourses = Database::num_rows($resCourse);
echo '[', time(), '] Found ', $countCourses, ' courses', $htmlEOL, PHP_EOL;

// Indexes filled while scanning: MD5 hash => list of files / file size
$md5Hashes = [];
$md5Sizes = [];
// Course code => course directory
$courseCodeDirMatch = [];
// Counters used by the scan and by the final report
$totalDocs = 0;
$totalDocsSize = 0;
$uniqueDocs = 0;
$duplicateDocsCount = 0;
$totalDuplicateDocs = 0;
$sysCoursePath = api_get_path(SYS_COURSE_PATH);

// Search for duplicate documents, by hashing the files found in the
// document directory of every course and grouping them by MD5 hash
if ($debug) {
    echo '[', time(), '] Iterating on courses...', $htmlEOL, PHP_EOL;
}
// Scan the "document" directory of each course returned by the query,
// adding the number of files registered to the global total
while ($course = Database::fetch_assoc($resCourse)) {
    if ($debug) {
        echo 'Course ', $course['id'], ' (', $course['code'], ')..', $htmlEOL, PHP_EOL;
    }
    // Remember which directory belongs to which course code
    $courseCodeDirMatch[$course['code']] = $course['directory'];
    $courseDir = "{$course['directory']}/document";
    $baseWorkDir = "{$sysCoursePath}{$courseDir}";
    $totalDocs = $totalDocs + _scanSubDirs(
        $baseWorkDir,
        $md5Hashes,
        $md5Sizes,
        $course['code'],
        $ignoreTemplates
    );
} // end while on course
// Sort array by sizes (largest first), keeping the hash => size association
arsort($md5Sizes, SORT_NUMERIC);
// Number of links met among the duplicate groups. It must be initialised
// before the loop: incrementing an undefined variable raises a warning.
$totalLinks = 0;
echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL;
echo " with a clickable link to each document:$htmlEOL".PHP_EOL;
foreach ($md5Sizes as $hash => $size) {
    $files = $md5Hashes[$hash];
    $countFiles = $realFilesForThisHash = count($files);
    if ($countFiles <= 1) {
        // Only hashes shared by more than one entry are reported
        continue;
    }
    // Position of the current real (non-link) file within this hash group
    $i = 0;
    $kSize = floor((int) $files[0]['size']/1024);
    echo ($html ? '' : '').$hash.", size: ".$kSize."KB ($countFiles copies): ".($html ? '' : 0).$htmlEOL.PHP_EOL;
    if ($html) {
        echo "";
    }
    foreach ($files as $file) {
        if ($file['link']) {
            // This is a link. Discount from real files
            // and do not count as potential space savings
            $realFilesForThisHash--;
            $totalLinks++;
        } else {
            if ($i != 0) {
                // We don't count the first file as a duplicate
                // nor as potential space savings
                $totalDuplicateDocs++;
                $totalDocsSize += $file['size'];
            }
            if ($html) {
                echo "- ".$file['subpath']."(".$file['course'].")
".PHP_EOL;
            } else {
                echo $webCode.$file['course'].", as ".$file['path'].$htmlEOL.PHP_EOL;
            }
            $i++;
        }
    }
    if ($html) {
        echo "
";
    }
    if ($realFilesForThisHash > 0) {
        // if at least one of those was really a file, count it as unique
        if ($debug) {
            // $file still holds the last entry of the group at this point
            echo "$hash (".$file['path'].") has $realFilesForThisHash duplicates".PHP_EOL;
        }
        $uniqueDocs++;
    }
}
// Summary: totals gathered during the scan and the report loop above
$duplicateDocsCount = $totalDocs - $uniqueDocs;
$sizeInMB = floor((int) $totalDocsSize / (1024*1024));
echo "[".time()."] Found $totalDocs docs in total (including links).".PHP_EOL;
echo " Only $uniqueDocs were original files with duplicates.".PHP_EOL;
echo " $totalDuplicateDocs files remain, as duplicates versions of some of those $uniqueDocs.".PHP_EOL;
echo " Potential savings: ~$sizeInMB MB.".PHP_EOL;
echo "";
/**
 * Recursively scans the given directory and registers every non-empty file
 * found in it (or in its subdirectories) under the MD5 hash of its contents.
 *
 * Entries whose name starts with a dot are ignored. When $ignoreTemplates is
 * true, the "audio", "flash", "images" and "video" folders located directly
 * under a "document" directory are skipped, as well as "document/index.html".
 * Only the exact folder names are skipped: a folder such as
 * "document/video_lectures" is still scanned.
 *
 * Reads the global $debug flag to print a message for each skipped item.
 *
 * @param string $path            Directory to scan
 * @param array  &$md5Hashes      Filled in place: MD5 hash => list of arrays
 *                                with keys course, link, path, size, subpath
 * @param array  &$md5Sizes       Filled in place: MD5 hash => file size in bytes
 * @param string $courseCode      Course code stored with each registered file
 * @param bool   $ignoreTemplates Whether to skip the template folders/files
 *
 * @return int Number of files registered (empty or unreadable files excluded)
 */
function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int
{
    global $debug;

    if (!is_dir($path)) {
        return 0;
    }
    if (substr(basename($path), 0, 1) === '.') {
        // If last path component starts with a '.', ignore
        return 0;
    }
    // The trailing (/|$) restricts the match to the exact folder names, so
    // that e.g. "document/images2" is not mistaken for a template folder
    if ($ignoreTemplates && preg_match('#/document/(audio|flash|images|video)(/|$)#', $path)) {
        if ($debug) {
            echo $path." is part of the template, skipping".PHP_EOL;
        }

        return 0;
    }

    $list = scandir($path);
    if ($list === false) {
        // The directory could not be read: nothing to count here
        if ($debug) {
            echo 'Skipping: Could not list the contents of '.$path.PHP_EOL;
        }

        return 0;
    }

    $count = 0;
    foreach ($list as $entry) {
        if (substr($entry, 0, 1) === '.') {
            // If the entry starts with a '.', ignore
            continue;
        }
        $subPath = $path.'/'.$entry;
        if ($ignoreTemplates && preg_match('#/document/index\.html$#', $subPath)) {
            continue;
        }
        if (is_dir($subPath)) {
            $count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates);
            continue;
        }

        $fileMd5 = md5_file($subPath);
        if ($fileMd5 === false) {
            if ($debug) {
                echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL;
            }
            continue;
        }
        $fileSize = filesize($subPath);
        if (!($fileSize > 0)) {
            // Also reached when filesize() fails and returns false
            if ($debug) {
                echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL;
            }
            continue;
        }

        // Path relative to the "document" directory; fall back to the full
        // path when the scanned tree is not located under one
        $matches = [];
        preg_match('#/document/(.*)#', $subPath, $matches);
        $md5Hashes[$fileMd5][] = [
            'course' => $courseCode,
            'link' => is_link($subPath),
            'path' => $subPath,
            'size' => $fileSize,
            'subpath' => $matches[1] ?? $subPath,
        ];
        $md5Sizes[$fileMd5] = $fileSize;
        $count++;
    }

    return $count;
}