*/
exit; // Safety guard: nothing below runs until this line is removed
// NOTE(review): the Session alias is not used in the visible part of the script — confirm before removing
use ChamiloSession as Session;
// Raise the memory limit to 512M; the scan keeps one entry per document file in memory
ini_set('memory_limit', '512M');
require_once __DIR__.'/../../main/inc/global.inc.php';
// Output mode: plain text by default (command line), switched to web mode below
$html = false;
// Suffix appended to each output line when running in web mode
$htmlEOL = '';
if (PHP_SAPI !== 'cli') {
    // Not running from the command line: use web output
    $html = true;
    // NOTE(review): this literal only holds a line break; it looks like markup
    // may have been lost here — confirm against the upstream script
    $htmlEOL = '
';
    // Access check for web execution (presumably restricts the page to platform admins — confirm)
    api_protect_admin_script();
    //die('This script can only be executed from the command line');
}
// Debug shows more output
$debug = false;
// Ignore template files
$ignoreTemplates = true;
// Base URL of the documents tool; the course code is appended to it in the report
$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq=';
if ($html) {
    echo "
".$htmlEOL;
}
echo "[".time()."] Querying courses$htmlEOL\n";
// Fetch all courses, ordered by id; the directory is needed to locate the files on disk
$sql = "SELECT id, code, directory FROM course order by id";
$resCourse = Database::query($sql);
if ($resCourse === false) {
    // The query itself failed: stop here
    exit('Could not find any course'.PHP_EOL);
}
$countCourses = Database::num_rows($resCourse);
echo "[".time()."] Found $countCourses courses$htmlEOL".PHP_EOL;
// Registries filled in by reference by _scanSubDirs(): the files grouped by
// MD5 hash, and the file size recorded for each hash
$md5Hashes = $md5Sizes = [];
// Counters used by the final report
$totalDocs = $totalDocsSize = $uniqueDocs = 0;
$duplicateDocsCount = $totalDuplicateDocs = 0;
// Course code => course directory
$courseCodeDirMatch = [];
$sysCoursePath = api_get_path(SYS_COURSE_PATH);
// Search for duplicate documents: every file found under the "document"
// directory of each course is registered under the MD5 hash of its contents
if ($debug) {
    echo "[".time()."] Iterating on courses...$htmlEOL".PHP_EOL;
}
while ($course = Database::fetch_assoc($resCourse)) {
    if ($debug) {
        echo "Course {$course['id']} ({$course['code']})..".$htmlEOL.PHP_EOL;
    }
    $courseCodeDirMatch[$course['code']] = $course['directory'];
    // Scan <courses root><course directory>/document and add up the files found
    $totalDocs += _scanSubDirs(
        $sysCoursePath.$course['directory'].'/document',
        $md5Hashes,
        $md5Sizes,
        $course['code'],
        $ignoreTemplates
    );
} // end while on course
// Sort the hashes by file size, largest first, so the biggest potential
// savings are listed at the top of the report
arsort($md5Sizes, SORT_NUMERIC);
echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL;
echo "             with a clickable link to each document:$htmlEOL".PHP_EOL;
// Number of symbolic links met while reporting. It has to exist before the
// loop increments it, otherwise PHP 8 raises an "Undefined variable" warning
$totalLinks = 0;
foreach ($md5Sizes as $hash => $size) {
    $files = $md5Hashes[$hash];
    $countFiles = $realFilesForThisHash = count($files);
    if ($countFiles <= 1) {
        // Only one file has this content: nothing to report
        continue;
    }
    // Path of the first real (non-link) file of the group, for the debug output
    $realFilePath = '';
    // Number of real (non-link) files already listed for this hash
    $i = 0;
    $kSize = floor((int) $files[0]['size']/1024);
    // NOTE(review): the two ternaries and the empty echo below look like
    // leftovers of markup that is no longer there; the second ternary prints
    // "0" in command-line mode — confirm against the upstream script
    echo ($html ? '' : '').$hash.", size: ".$kSize."KB ($countFiles copies): ".($html ? '' : 0).$htmlEOL.PHP_EOL;
    if ($html) {
        echo "";
    }
    foreach ($files as $file) {
        if ($file['link']) {
            // This is a link. Discount from real files
            // and do not count as potential space savings
            $realFilesForThisHash--;
            $totalLinks++;
            continue;
        }
        if ($i === 0) {
            // We don't count the first file as a duplicate
            // nor as potential space savings, but we remember it as the original
            $realFilePath = $file['path'];
        } else {
            $totalDuplicateDocs++;
            $totalDocsSize += $file['size'];
        }
        if ($html) {
            // These values come from names found on disk: escape them for web output
            echo "- ".htmlspecialchars((string) $file['subpath'], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
                ."(".htmlspecialchars((string) $file['course'], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8').")
 ".PHP_EOL;
        } else {
            echo $webCode.$file['course'].", as ".$file['path'].$htmlEOL.PHP_EOL;
        }
        $i++;
    }
    if ($html) {
        echo "
";
    }
    if ($realFilesForThisHash > 0) {
        // if at least one of those was really a file, count it as unique
        if ($debug) {
            // Show the original file, not whichever entry the loop ended on
            echo "$hash (".$realFilePath.") has $realFilesForThisHash duplicates".PHP_EOL;
        }
        $uniqueDocs++;
    }
}
$duplicateDocsCount = $totalDocs - $uniqueDocs;
// Total size of the duplicated copies, in whole megabytes
$sizeInMB = floor((int) $totalDocsSize / (1024*1024));
echo "[".time()."] Found $totalDocs docs in total (including links).".PHP_EOL;
echo "             Only $uniqueDocs were original files with duplicates.".PHP_EOL;
echo "             $totalDuplicateDocs files remain, as duplicates versions of some of those $uniqueDocs.".PHP_EOL;
echo "             Potential savings: ~$sizeInMB MB.".PHP_EOL;
echo "";
/**
 * Recursively scans the given directory and registers every non-empty file
 * it finds under the MD5 hash of its contents.
 *
 * Directories and entries whose name starts with a dot are ignored. Empty
 * files and files whose MD5 cannot be computed are skipped and not counted.
 * When $ignoreTemplates is true, paths matching /document/(audio|flash|images|video)
 * and files matching /document/index.html are skipped as well.
 *
 * @param string $path            Directory to scan
 * @param array  &$md5Hashes      Filled in place: hash => list of entries with the
 *                                keys course, link, path, size and subpath
 * @param array  &$md5Sizes       Filled in place: hash => size in bytes of the
 *                                last file registered with that hash
 * @param string $courseCode      Course code stored with each registered file
 * @param bool   $ignoreTemplates Whether to skip the template folders and index.html
 *
 * @return int Number of files registered (sub-directories included)
 */
function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int
{
    // Debug flag set by the calling script; enables the "skipping" messages
    global $debug;
    $count = 0;
    if (!is_dir($path)) {
        return 0;
    }
    if (substr(basename($path), 0, 1) === '.') {
        // If last path component starts with a '.', ignore
        return 0;
    }
    if ($ignoreTemplates) {
        // NOTE(review): the pattern is not anchored after the folder name, so a
        // folder such as /document/videos is skipped too — confirm this is intended
        if (preg_match('#/document/(audio|flash|images|video)#', $path)) {
            if ($debug) {
                echo $path." is part of the template, skipping".PHP_EOL;
            }
            return 0;
        }
    }
    $list = scandir($path);
    if ($list === false) {
        // scandir() returns false when the directory cannot be read: skip it
        // instead of handing a boolean to foreach
        if ($debug) {
            echo 'Skipping: Could not list the contents of '.$path.PHP_EOL;
        }
        return 0;
    }
    foreach ($list as $entry) {
        if (substr($entry, 0, 1) === '.') {
            // If the entry starts with a '.', ignore (this covers '.' and '..')
            continue;
        }
        $subPath = $path.'/'.$entry;
        if ($ignoreTemplates && preg_match('#/document/index.html$#', $subPath)) {
            continue;
        }
        if (is_dir($subPath)) {
            $count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates);
            continue;
        }
        $fileMd5 = md5_file($subPath);
        if ($fileMd5 === false) {
            if ($debug) {
                echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL;
            }
            continue;
        }
        $fileSize = filesize($subPath);
        if (!($fileSize > 0)) {
            // Covers empty files as well as a failed filesize() call (false)
            if ($debug) {
                echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL;
            }
            continue;
        }
        if (!isset($md5Hashes[$fileMd5])) {
            $md5Hashes[$fileMd5] = [];
        }
        // Path relative to the "document" folder; when the scanned path has
        // no /document/ segment, fall back to the full path instead of
        // reading an undefined match
        $matches = [];
        preg_match('#/document/(.*)#', $subPath, $matches);
        $md5Hashes[$fileMd5][] = [
            'course' => $courseCode,
            'link' => is_link($subPath),
            'path' => $subPath,
            'size' => $fileSize,
            'subpath' => $matches[1] ?? $subPath,
        ];
        $md5Sizes[$fileMd5] = $fileSize;
        $count++;
    }
    return $count;
}