Script: Add script to detect duplicate files by MD5 sum - refs BT#19673

3 years ago · 9971bb0d99
parent be0467274d
commit 9971bb0d99
1 changed files with 215 additions and 0 deletions
--- a/tests/scripts/find_duplicate_documents_by_md5.php
+++ b/tests/scripts/find_duplicate_documents_by_md5.php
@ -0,0 +1,215 @@
+<?php
+/* For licensing terms, see /license.txt */
+/**
+ * This script finds duplicated documents to save disk space.
+ *
+ * It identifies duplicate documents by building an array of MD5 hashes
+ * from the app/courses/[CODE]/document/ folders on your system
+ * and checking for similar hashes everywhere in the other documents folders.
+ *
+ * This script should be located inside the tests/scripts/ folder to work
+ * on the command line. To run it in a browser, move it to main/inc/ and
+ * load it from there.
+ * It can be run more than one time as it will only ever affect duplicate
+ * documents.
+ *
+ * @author Yannick Warnier <yannick.warnier@beeznest.com>
+ */
+//exit; //remove this line to execute from the command line
+
+use ChamiloSession as Session;
+
+ini_set('memory_limit', '512M');
+
+require_once __DIR__.'/../../main/inc/global.inc.php';
+
+$html = false;
+$htmlEOL = '';
+if (PHP_SAPI !== 'cli') {
+    $html = true;
+    $htmlEOL = '<br />';
+    api_protect_admin_script();
+    //die('This script can only be executed from the command line');
+}
+
+// Debug shows more output
+$debug = false;
+// Ignore template files
+$ignoreTemplates = true;
+$webCode = api_get_path(WEB_CODE_PATH).'document/document.php?cidReq=';
+
+if ($html) {
+    echo "<html><body>".$htmlEOL;
+}
+echo "[".time()."] Querying courses$htmlEOL\n";
+$sql = "SELECT id, code, directory FROM course order by id";
+
+$resCourse = Database::query($sql);
+if ($resCourse === false) {
+    exit('Could not find any course'.PHP_EOL);
+}
+$countCourses = Database::num_rows($resCourse);
+echo "[".time()."] Found $countCourses courses$htmlEOL".PHP_EOL;
+
+$md5Hashes = [];
+$md5Sizes = [];
+$totalDocs = 0;
+$totalDocsSize = 0;
+$uniqueDocs = 0;
+$duplicateDocsCount = 0;
+$totalDuplicateDocs = 0;
+$courseCodeDirMatch = [];
+$sysCoursePath = api_get_path(SYS_COURSE_PATH);
+
+// Search for duplicate tests, by looking for tests that have the exact same
+// title in the same course
+if ($debug) {
+    echo "[".time()."] Iterating on courses...$htmlEOL".PHP_EOL;
+}
+while ($course = Database::fetch_assoc($resCourse)) {
+    if ($debug) {
+        echo "Course ".$course['id'].' ('.$course['code'].')..'.$htmlEOL.PHP_EOL;
+    }
+    $courseCodeDirMatch[$course['code']] = $course['directory'];
+    $courseDir = $course['directory'].'/document';
+    $baseWorkDir = $sysCoursePath.$courseDir;
+    $totalDocs += _scanSubDirs($baseWorkDir, $md5Hashes, $md5Sizes, $course['code'], $ignoreTemplates);
+} // end while on course
+
+// Sort array by sizes
+arsort($md5Sizes, SORT_NUMERIC);
+
+echo "[".time()."] Here is a list of duplicate files, joined together$htmlEOL".PHP_EOL;
+echo "             with a clickable link to each document:$htmlEOL".PHP_EOL;
+foreach ($md5Sizes as $hash => $size) {
+    $files = $md5Hashes[$hash];
+    $countFiles = $realFilesForThisHash = count($files);
+    $realFilePath = '';
+    $i = 0;
+    if ($countFiles > 1) {
+        $kSize = floor((int) $files[0]['size']/1024);
+        echo ($html ? '<b>' : '').$hash.", size: ".$kSize."KB ($countFiles copies): ".($html ? '</b>' : 0).$htmlEOL.PHP_EOL;
+        if ($html) {
+            echo "<ul>";
+        }
+    } else {
+        continue;
+    }
+    foreach ($files as $file) {
+        if ($file['link']) {
+            // This is a link. Discount from real files
+            // and do not count as potential space savings
+            $realFilesForThisHash--;
+            $totalLinks++;
+        } else {
+            if ($i != 0) {
+                // We don't count the first file as a duplicate
+                // nor as potential space savings
+                $totalDuplicateDocs++;
+                $totalDocsSize += $file['size'];
+            }
+            if ($html) {
+                echo "<li><a target='_blank' href='".$webCode.$file['course']."'>".$file['subpath']."(".$file['course'].")</a></li>".PHP_EOL;
+            } else {
+                echo $webCode.$file['course'].", as ".$file['path'].$htmlEOL.PHP_EOL;
+            }
+            $i++;
+        }
+    }
+    if ($html) {
+        echo "</ul>";
+    }
+    if ($realFilesForThisHash > 0) {
+        // if at least one of those was really a file, count it as unique
+        if ($debug) {
+            echo "$hash (".$file['path'].") has $realFilesForThisHash duplicates".PHP_EOL;
+        }
+        $uniqueDocs++;
+    }
+}
+$duplicateDocsCount = $totalDocs - $uniqueDocs;
+
+$sizeInMB = floor((int) $totalDocsSize / (1024*1024));
+echo "[".time()."] Found $totalDocs docs in total (including links).".PHP_EOL;
+echo "             Only $uniqueDocs were original files with duplicates.".PHP_EOL;
+echo "             $totalDuplicateDocs files remain, as duplicates versions of some of those $uniqueDocs.".PHP_EOL;
+echo "             Potential savings: ~$sizeInMB MB.".PHP_EOL;
+
+echo "</body></html>";
+
+/**
+ * Scans the given directory and its subdirectories and returns a number of
+ * files found
+ * @param string $path
+ * @param array &$md5Hashes
+ * @param string $courseCode
+ * @param bool $ignoreTemplates
+ * @return int
+ */
+function _scanSubDirs(string $path, array &$md5Hashes, array &$md5Sizes, string $courseCode, bool $ignoreTemplates = false): int
+{
+    global $debug;
+    $count = 0;
+    if (!is_dir($path)) {
+        return 0;
+    }
+    if (substr(basename($path), 0, 1) === '.') {
+        // If last path component starts with a '.', ignore
+        return 0;
+    }
+    if ($ignoreTemplates) {
+        if (preg_match('#/document/(audio|flash|images|video)#', $path)) {
+            if ($debug) {
+                echo $path." is part of the template, skipping".PHP_EOL;
+            }
+            return 0;
+        }
+    }
+    $list = scandir($path);
+    foreach ($list as $entry) {
+        if (substr($entry, 0, 1) === '.') {
+            // If the entry starts with a '.', ignore
+            continue;
+        }
+        $subPath = $path.'/'.$entry;
+        if ($ignoreTemplates && preg_match('#/document/index.html$#', $subPath)) {
+            continue;
+        }
+        if (is_dir($subPath)) {
+            $count += _scanSubDirs($subPath, $md5Hashes, $md5Sizes, $courseCode, $ignoreTemplates);
+        } else {
+            $fileMd5 = md5_file($subPath);
+            if ($fileMd5 === false) {
+                if ($debug) {
+                    echo 'Skipping: There was an error calculating MD5 of '.$subPath.PHP_EOL;
+                }
+                continue;
+            }
+            $fileSize = filesize($subPath);
+            if ($fileSize > 0) {
+                if (!isset($md5Hashes[$fileMd5])) {
+                    $md5Hashes[$fileMd5] = [];
+                }
+                $matches = [];
+                preg_match('#/document/(.*)#', $subPath, $matches);
+                $isLink = is_link($subPath);
+                $md5Hashes[$fileMd5][] = [
+                    'course' => $courseCode,
+                    'link' => $isLink,
+                    'path' => $subPath,
+                    'size' => $fileSize,
+                    'subpath' => $matches[1],
+                ];
+                $md5Sizes[$fileMd5] = $fileSize;
+            } else {
+                if ($debug) {
+                    echo 'Skipping: Filesize 0 for '.$subPath.PHP_EOL;
+                }
+                continue;
+            }
+            $count++;
+        }
+    }
+
+    return $count;
+}