diff --git a/tests/scripts/delete_duplicate_documents.php b/tests/scripts/delete_duplicate_documents.php new file mode 100644 index 0000000000..181d7f2de5 --- /dev/null +++ b/tests/scripts/delete_duplicate_documents.php @@ -0,0 +1,185 @@ + 9, so one digit is enough) + * - same course, same session (otherwise considered a different file, a voluntary copy) + * - each have entries in c_item_property because it was created legitimately + * Possible duplicates can be found with a query like: + * SELECT id, size, title, path FROM c_document WHERE c_id = 470 AND path like '%\__.%' ORDER BY path, title; + * This script should be located inside the tests/scripts/ folder to work. + * It can be run more than one time as it will only ever affect duplicate + * documents. + * If you have a very large number of documents, we recommend you temporarily + * comment out the api_item_property_update() calls in + * DocumentManager::deleteDocumentFromDb. + * Chances are there is not even a registry of those documents there in the + * first place (they were probably duplicated through a short process) and + * this is where most of the time is spent during deletion. 
+ * @author Yannick Warnier + */
exit; //remove this line to execute from the command line
use ChamiloSession as Session;

ini_set('memory_limit', '256M');

if (PHP_SAPI !== 'cli') {
    die('This script can only be executed from the command line');
}

require_once __DIR__.'/../../main/inc/global.inc.php';

// Debug shows more output and only does a fake run: nothing is deleted, but
// the counters are still updated so the summary shows what a real run would do.
$debug = false;

// Register user 1 as the current user in the session before any document
// operation runs.
// NOTE(review): presumably user 1 is the platform admin and the deletion API
// reads the session user - confirm.
$_user['user_id'] = 1;
Session::write('_user', $_user);

echo "[".time()."] Querying courses\n";
$sql = "SELECT id, code FROM course order by id";

$resCourse = Database::query($sql);
if ($resCourse === false) {
    exit('Could not find any course'.PHP_EOL);
}
$countCourses = Database::num_rows($resCourse);
echo "[".time()."] Found $countCourses courses".PHP_EOL;

// Check all c_document.id = c_document.iid, otherwise cancel: the rest of the
// script identifies documents (deletion, learning path references) by iid only.
$sql = "SELECT iid FROM c_document WHERE id != iid";
$res = Database::query($sql);
if (Database::num_rows($res) > 0) {
    echo "We have detected that some c_document.id do not match c_document.iid.".PHP_EOL;
    echo "This can lead to serious inconsistencies in the execution of this script.".PHP_EOL;
    echo "Please fix this issue first, then try this script again.".PHP_EOL;
    exit;
}

// Counters reported in the final summary.
$duplicateDocsCount = 0; // files whose path looks like a copy of the previous original
$originalDocsCount = 0;  // files recorded as a (possible) original
$deletedDocsCount = 0;   // duplicates deleted (or that would be deleted, in debug mode)
$docsInLP = 0;           // duplicates kept because a learning path item references them
$deletedDocsSize = 0;    // cumulated size of the deleted duplicates, in bytes

// The courses' base directory does not depend on the course: resolve it once.
$sysCoursePath = api_get_path(SYS_COURSE_PATH);

// Search for duplicate documents, by looking for files that have the same
// path as another file of the same course, plus a "_<digit>" suffix right
// before the extension.
echo "[".time()."] Iterating on courses: ";
while ($course = Database::fetch_assoc($resCourse)) {
    // Kept from the original script; nothing below reads this key.
    $course['real_id'] = $course['id'];
    $courseId = (int) $course['id'];
    if ($debug) {
        echo PHP_EOL."Course ".$course['id'].'..'.PHP_EOL;
    }
    $_course = api_get_course_info_by_id($course['id']);
    $baseWorkDir = $sysCoursePath.$_course['directory'].'/document';

    // We consider duplicates in sessions to be highly improbable, as course
    // copies that could have been broken are essentially made on base courses.
    // NOTE(review): the detection below needs each original to be returned
    // right before its copies; whether "path desc" yields that order depends
    // on how the database collation ranks '_' against '.' - confirm on your setup.
    $sql2 = "SELECT iid, title, path, size FROM c_document
            WHERE c_id = $courseId
                AND (session_id = 0 OR session_id IS NULL)
                AND filetype = 'file'
            ORDER BY path desc, title, iid";
    $res2 = Database::query($sql2);
    if ($res2 === false) {
        die("Error querying docs in course code ".$course['code'].": ".Database::error($res2)."\n");
    }

    // Track the root filename, which is not always the one without _%d at the end.
    // Sometimes, the original has been deleted but there are still replicates.
    $lastOriginalDocPath = '';
    $lastOriginalDocId = 0;
    $lastOriginalDocSize = 0;

    while ($doc = Database::fetch_assoc($res2)) {
        $docId = (int) $doc['iid'];
        $docSize = (int) $doc['size'];
        if ($debug) {
            echo $doc['path'].PHP_EOL;
        }

        // A copy is named "<original name>_<one digit>.<extension of 1 to 4 chars>".
        $matches = [];
        $looksLikeCopy = preg_match('/(.*)_\d(\.[a-zA-Z0-9]{1,4})$/', $doc['path'], $matches);

        if (!$looksLikeCopy) {
            if ($debug) {
                echo "This looks like an original. Recording and moving on...".PHP_EOL;
            }
            $lastOriginalDocPath = $doc['path'];
            $lastOriginalDocId = $docId;
            $lastOriginalDocSize = $docSize;
            $originalDocsCount++;
            // Move directly to the next item
            continue;
        }

        if ($debug) {
            echo "This looks like a copy".PHP_EOL;
        }
        // Path the original would have: same path without the "_<digit>" suffix.
        $guessedOriginal = $matches[1].$matches[2];
        if ($debug) {
            echo "The original would be ".$guessedOriginal.PHP_EOL;
        }

        if ($lastOriginalDocPath !== $guessedOriginal) {
            if ($debug) {
                echo "The guessed original filename is different from the original, or the original could not be found. Skipping...".PHP_EOL;
            }
            // The path is different -> moving on to another doc, but
            // recording this doc's details as the new reference, just in case
            $lastOriginalDocPath = $doc['path'];
            $lastOriginalDocId = $docId;
            $lastOriginalDocSize = $docSize;
            $originalDocsCount++;
            continue;
        }

        // A likely duplicate: counted even if it is not deleted below.
        $duplicateDocsCount++;

        // Only bother if the doc's internal ID is higher than the last
        // original doc ID, which means this (duplicate) document has been
        // created *after* the original, and if both have the same size.
        if ($docId <= $lastOriginalDocId || $docSize !== $lastOriginalDocSize) {
            continue;
        }
        if ($debug) {
            echo "This doc has been created after the original and has the same size. Good.".PHP_EOL;
        }

        // This duplicate document could have been seen or downloaded already,
        // but this is not considered critical when deciding whether to clean
        // it or not.
        // It is, however, essential to make sure this duplicate document is
        // not used from inside a learning path.
        $sql4 = "SELECT lp_id FROM c_lp_item
                WHERE c_id = $courseId
                    AND item_type = 'document' AND ref = $docId";
        $res4 = Database::query($sql4);
        // Strict comparison on purpose: anything other than an exact 0 rows
        // (including a failed count) must not lead to a deletion.
        if (0 !== Database::num_rows($res4)) {
            $docsInLP++;
            if ($debug) {
                echo "This document is used from a learning path. Deletion cancelled.".PHP_EOL;
            }
            continue;
        }

        if ($debug) {
            // Fake run: report only, do not touch the database or the disk.
            echo "The file is not used in any LP".PHP_EOL;
            echo $doc['iid'].' deleted.'.PHP_EOL;
        } else {
            DocumentManager::delete_document($_course, $doc['path'], $baseWorkDir, null, $doc['iid']);
            DocumentManager::purgeDocument($doc['iid'], $_course);
        }
        $deletedDocsCount++;
        $deletedDocsSize += $docSize;
    } // end while on c_document
} // end while on course

// Divide first, then round: a leading (int) cast would bind to
// $deletedDocsSize only and leave an unrounded float.
$sizeInMB = round($deletedDocsSize / (1024 * 1024), 2);
echo "[".time()."] Found $originalDocsCount original docs and $duplicateDocsCount duplicate docs...".PHP_EOL;
echo "Of these duplicates, $docsInLP were included in learning paths.".PHP_EOL;
echo "Deleted $deletedDocsCount ($duplicateDocsCount - $docsInLP) docs for a total of $sizeInMB MB.".PHP_EOL;