Crawler - Trouver tous les liens du web
À base de regex, trouve tous les liens sur le web.
Le fichier "links.txt" va contenir les noms d'hôte de tous les liens trouvés.
Le fichier "visited.txt" va contenir tous les liens déjà visités.
<?php
/*
 * Minimal web crawler driven by two query-string parameters:
 *
 *   action=visite&url=<http(s) URL>
 *       Downloads the page, extracts the host name of every absolute link
 *       found in it, appends hosts not yet known to links.txt and prints
 *       the hosts found on that page.
 *
 *   action=search
 *       For every host listed in links.txt but not in visited.txt, records
 *       the host in visited.txt and calls this script's "visite" action on it.
 */

set_time_limit(0);

$myFile1 = 'links.txt';   // hosts discovered so far, one per line
$myFile2 = 'visited.txt'; // hosts already handed to the "visite" action
$links = array();

/**
 * Tells whether a file contains a line exactly equal to $id.
 *
 * The comparison is done on whole lines (line terminator stripped), so a
 * host is not reported as present merely because it is a substring of
 * another stored host.
 *
 * @param string $nameOfFile Path of the file to scan.
 * @param string $id         Value to look for.
 *
 * @return bool True when a matching line exists; false otherwise, including
 *              when $id is empty or the file is missing or unreadable.
 */
function searchInFile($nameOfFile, $id)
{
    $id = (string) $id;
    if ($id === '' || !is_file($nameOfFile) || !is_readable($nameOfFile)) {
        return false;
    }

    $handle = fopen($nameOfFile, 'r');
    if ($handle === false) {
        return false;
    }

    $valid = false;
    while (($buffer = fgets($handle)) !== false) {
        if (rtrim($buffer, "\r\n") === $id) {
            $valid = true;
            break;
        }
    }
    fclose($handle);

    return $valid;
}

/**
 * Appends $id followed by a newline to a file, creating the file if needed.
 *
 * The append is done under an exclusive lock so that overlapping requests
 * do not interleave their writes.
 *
 * @param string $nameOfFile Path of the file to append to.
 * @param string $id         Value to store on its own line.
 *
 * @return bool True when the line was written, false on failure.
 */
function writeInFile($nameOfFile, $id)
{
    return file_put_contents($nameOfFile, $id . "\n", FILE_APPEND | LOCK_EX) !== false;
}

/**
 * Reads a file into a list of its non-empty lines, without line terminators.
 *
 * @param string $nameOfFile Path of the file to read.
 *
 * @return array List of lines; empty when the file is missing or unreadable.
 */
function readLines($nameOfFile)
{
    if (!is_file($nameOfFile) || !is_readable($nameOfFile)) {
        return array();
    }

    $lines = file($nameOfFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

    return $lines === false ? array() : $lines;
}

/**
 * Tells whether $url may be fetched by the crawler.
 *
 * The URL comes straight from the query string, so only http/https URLs
 * with a host are accepted; local paths and other stream wrappers are not.
 *
 * @param mixed $url Candidate URL.
 *
 * @return bool True when $url is an http or https URL with a host part.
 */
function isFetchableUrl($url)
{
    if (!is_string($url)) {
        return false;
    }

    $parts = parse_url($url);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }

    return in_array(strtolower($parts['scheme']), array('http', 'https'), true);
}

/**
 * Extracts the distinct host names of the URLs found in a text.
 *
 * Candidates are located with a URL regex, then passed to parse_url();
 * only candidates for which parse_url() reports a host are kept, which in
 * practice means scheme-less matches are dropped.
 *
 * @param string $content Text (typically HTML) to scan.
 *
 * @return array List of unique hosts, in order of first appearance.
 */
function extractHosts($content)
{
    $regex = "((https?|ftp)\:\/\/)?";                                       // scheme
    $regex .= "([a-z0-9+!*(),;?&=\$_.-]+(\:[a-z0-9+!*(),;?&=\$_.-]+)?@)?"; // user:pass@
    $regex .= "([a-z0-9-.]*)\.([a-z]{2,4})";                                // host + TLD
    $regex .= "(\:[0-9]{2,5})?";                                            // port
    $regex .= "(\/([a-z0-9+\$_-]\.?)+)*\/?";                                // path
    $regex .= "(\?[a-z+&\$_.-][a-z0-9;:@&%=+\/\$_.-]*)?";                   // query string
    $regex .= "(#[a-z_.-][a-z0-9+\$_.-]*)?";                                // fragment
    $pattern = "/$regex/";

    $matches = array();
    // preg_match_all() returns false on a PCRE error and 0 when nothing matched.
    if (!preg_match_all($pattern, (string) $content, $matches)) {
        return array();
    }

    $hosts = array();
    foreach (array_unique($matches[0]) as $match) {
        $parts = parse_url($match);
        // parse_url() returns false for seriously malformed input.
        if (is_array($parts) && isset($parts['host']) && !in_array($parts['host'], $hosts, true)) {
            $hosts[] = $parts['host'];
        }
    }

    return $hosts;
}

// Keep each remote fetch short so one slow site does not stall the crawl.
$ctx = stream_context_create(array(
    'http' => array(
        'timeout' => 3,
    ),
));

$url = (isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : '';
$action = (isset($_GET['action']) && is_string($_GET['action'])) ? $_GET['action'] : '';

if ($action === 'visite') {
    // The original code guarded this branch with
    // "if (!searchInFile($myFile1, $url));" whose stray semicolon made the
    // test a no-op, so the page was always crawled. That behavior is kept:
    // links.txt stores bare hosts, so a full URL would never match a line.
    $content = isFetchableUrl($url) ? file_get_contents($url, false, $ctx) : false;
    $links = ($content === false) ? array() : extractHosts($content);

    // Persist the hosts that are not known yet.
    foreach ($links as $link) {
        if (!searchInFile($myFile1, $link)) {
            writeInFile($myFile1, $link);
        }
    }

    print_r($links);
} elseif ($action === 'search') {
    $visitEndpoint = 'http://dleloup.alwaysdata.net/contents/tools/links.php?action=visite&url=http://';

    $links = readLines($myFile1);
    $linksVisited = readLines($myFile2);

    // Hosts still to visit: discovered but not yet recorded as visited.
    $linksToVisit = array_diff($links, $linksVisited);

    foreach ($linksToVisit as $link) {
        // Re-check the file: links.txt may hold duplicates, and an earlier
        // iteration may already have recorded this host.
        if (!searchInFile($myFile2, $link)) {
            writeInFile($myFile2, $link);
            file_get_contents($visitEndpoint . $link, false, $ctx);
        }
    }
}
?>