Tivoilos
May 20th, 2008, 09:19 AM
Hi all!
I am developing a search engine (the engine itself is done),
but now I need to crawl the web with bots/spiders, etc.
I am unsure how or where to start, so if anyone could assist me it would be very helpful. Thanks in advance!
~Tiv
dclamp
May 21st, 2008, 06:23 PM
try something from google (http://www.google.com/search?q=php+web+crawler&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:official&client=firefox-a)
RudiVisser
May 28th, 2008, 02:02 PM
Wow, this is old and brings back some bad memories. It's awful, but it might be a place to start:
#!/usr/bin/php
<?php
// Command-line indexer: fetches a page, extracts title/keywords/links/body
// and stores them in the `geego_index` table (see index() below).

// Project MySQL helper. Presumably this defines dbquery() and the $link
// connection used further down -- verify against that file.
include("/home/default/geego.co.uk/user/library/mysql.php");
// Buffer output so the end of the script can copy it into ./lastindex.log.
ob_start();
// Identify the crawler: the `user_agent` ini setting is what PHP's HTTP
// stream wrapper (used by file_get_contents()) sends as User-Agent.
ini_set('user_agent', "Mozilla/5.0 (compatible; Geegobot/0.1; +http://www.geego.co.uk)");
/**
 * Fetches a page, extracts its title, heading keywords, absolute links and
 * plain-text body, and inserts one row for it into `geego_index`.
 *
 * @param string $url       Address of the page to fetch and index.
 * @param mixed  $link      Database handle passed through to dbquery().
 * @param int    $showlinks 1 = echo every discovered link and return the list
 *                          of links; anything else = return a boolean.
 * @param mixed  $bias      Bias value stored with the row (default 1).
 *
 * @return array|bool With $showlinks == 1: the list of absolute http(s) links
 *                    found in the page (possibly empty), or false when the
 *                    page could not be fetched. Otherwise: true when a new row
 *                    was inserted, false when the page could not be fetched
 *                    or the URL is already present in the index.
 */
function index($url, $link, $showlinks, $bias=1) {
    $contents = file_get_contents($url);
    if ($contents === false) {
        // Unreachable page: nothing to index and no links to follow.
        return false;
    }

    // PCRE replaces the removed POSIX eregi(); the `s` flag lets `.` span
    // newlines, `i` keeps the match case-insensitive.
    $title = null;
    if (preg_match('/<title[^>]*>(.*?)<\/title>/is', $contents, $match)) {
        $title = $match[1];
    }
    $body = null;
    if (preg_match('/<body[^>]*>(.*)<\/body>/is', $contents, $match)) {
        $body = $match[1];
    }

    $stripped_body = index_strip_pattern('/<img[^>]*>/i', (string) $body);
    $stripped_body = index_strip_pattern('/<\/img>/i', $stripped_body);

    // <h1> texts are the primary keywords, <h2> texts the less valuable ones.
    $keywords = index_heading_keywords($stripped_body, 'h1');
    $lkeywords = index_heading_keywords($stripped_body, 'h2');

    // Collect outgoing links. Only absolute http(s) URLs are kept, so
    // relative paths, mailto: and other schemes are never queued (and can
    // never be handed to file_get_contents() as a local path).
    $links = "";
    $retas = array();
    if (preg_match_all("/<a href=\"(.*?)\"[^>]*>(.*?)<\/a>/is", $stripped_body, $as)) {
        foreach ($as[1] as $a) {
            if (!preg_match('#^https?://#i', $a)) {
                continue;
            }
            $retas[] = $a;
            if ($showlinks == 1) {
                echo 'found link \''.$a."'\n\t- Queued for Index\n";
                ob_flush();
                flush();
            }
            // Escape so that `&` in query strings cannot break the XML.
            $links .= "\t\t<link>".htmlspecialchars(strip_tags($a))."</link>\n";
        }
    }

    // Drop markup whose content must not end up in the searchable text.
    $stripped_body = index_strip_pattern('/<form[^>]*>(.*?)<\/form>/is', $stripped_body);
    $stripped_body = index_strip_pattern('/<object[^>]*>(.*?)<\/object>/is', $stripped_body);
    $stripped_body = index_strip_pattern('/<script[^>]*>(.*?)<\/script>/is', $stripped_body);
    $stripped_body = index_strip_pattern('/<!--.*?-->/s', $stripped_body);

    // NOTE(review): the literal replacements below look like clean-ups for
    // mis-encoded characters; the exact bytes of the search strings depend on
    // this file's encoding -- confirm before changing any of them.
    $stripped_body = str_replace("Ã", "", $stripped_body);
    $stripped_body = strip_tags($stripped_body);
    $stripped_body = htmlentities($stripped_body);
    $stripped_body = str_replace(" ", "", $stripped_body);
    $stripped_body = str_replace(" ", "", $stripped_body);
    $stripped_body = str_replace("â", "&acirc;", $stripped_body);
    $stripped_body = str_replace("¢", "&cent;", $stripped_body);
    $stripped_body = str_replace("ó", "&oacute;", $stripped_body);
    $stripped_body = str_replace("é", "&eacute;", $stripped_body);
    $stripped_body = str_replace("á", "&aacute;", $stripped_body);
    $stripped_body = str_replace("ñ", "&ntilde;", $stripped_body);
    // Remove the bytes EF BF BD (UTF-8 replacement character); same effect
    // as the former urlencode()/urldecode() round trip.
    $stripped_body = str_replace("\xEF\xBF\xBD", "", $stripped_body);
    $stripped_body = str_replace("£", "&pound;", $stripped_body);

    $titleText = ($title === null) ? '' : $title;
    $description = substr($stripped_body, 0, 150);
    // "Warning" flags pages where no <title> or no <body> element was found.
    $result = ($title !== null && $body !== null) ? "Success" : "Warning";

    $xmlinsert = "<?xml version=\"1.0\" ?>\n";
    $xmlinsert .= "<geego_index>\n";
    $xmlinsert .= "\t<title>".htmlentities($titleText)."</title>\n";
    $xmlinsert .= "\t<links>".$links."</links>\n";
    $xmlinsert .= "\t<keywords>".htmlentities($keywords)."</keywords>\n";
    $xmlinsert .= "\t<lesser_keywords>".htmlentities($lkeywords)."</lesser_keywords>\n";
    $xmlinsert .= "\t<description>".$description."</description>\n";
    $xmlinsert .= "\t<body>".$stripped_body."</body>\n";
    $xmlinsert .= "\t<result>".$result."</result>\n";
    $xmlinsert .= "</geego_index>";

    // Every value is escaped the same way before it is placed in SQL.
    // NOTE(review): addslashes() matches what this script already did for the
    // text columns; a driver-level escape or prepared statement would be
    // safer, but dbquery()'s API is not visible from here.
    $safeUrl = addslashes($url);

    // Skip the insert when this address is already in the index.
    $query = dbquery("SELECT * FROM `geego_index` WHERE `site_address` = '".$safeUrl."'", $link);
    if (mysql_num_rows($query) != 0) {
        return ($showlinks == 1) ? $retas : false;
    }

    $allKeywords = $keywords.($lkeywords != "" ? ', '.$lkeywords : '');
    dbquery("INSERT INTO `geego_index` VALUES (null, '".$safeUrl."', '".addslashes($titleText)."', '".addslashes($stripped_body)."', '".addslashes($description)."', UNIX_TIMESTAMP(), '".addslashes($allKeywords)."', 'force', 'Y', '".addslashes((string) $bias)."', '".addslashes($xmlinsert)."');", $link);

    return ($showlinks == 1) ? $retas : true;
}

/**
 * Removes every match of $pattern from $subject.
 *
 * preg_replace() returns NULL on a PCRE error (e.g. backtrack limit hit on a
 * very large page); in that case the subject is returned unchanged instead of
 * silently discarding the whole text.
 */
function index_strip_pattern($pattern, $subject) {
    $result = preg_replace($pattern, "", $subject);
    return ($result === null) ? $subject : $result;
}

/**
 * Returns the tag-stripped contents of every <$tag> element in $html,
 * joined with ", " (empty string when there is none).
 */
function index_heading_keywords($html, $tag) {
    $texts = array();
    if (preg_match_all('/<'.$tag.'[^>]*>(.*?)<\/'.$tag.'>/is', $html, $found)) {
        foreach ($found[1] as $inner) {
            $texts[] = strip_tags($inner);
        }
    }
    return implode(", ", $texts);
}
// ---------------------------------------------------------------------------
// Command-line dispatch.
//   <script> -h | --help | -help | -?         print usage
//   <script> -r|-f|--recursive|--follow <URL> index <URL>, then every link on it
//   ... <URL> -p <bias>                       same, storing <bias> for <URL>
// Anything else just echoes the received arguments back.
// ---------------------------------------------------------------------------
$helpFlags = array('--help', '-help', '-h', '-?');
$followFlags = array('-r', '--recursive', '-f', '--follow');

// Arguments may be missing entirely; default them instead of reading
// undefined offsets.
$modeArg = isset($argv[1]) ? $argv[1] : '';
$startUrl = isset($argv[2]) ? $argv[2] : '';

// The start address has to begin with "http", not merely contain it.
$wantsFollow = in_array($modeArg, $followFlags, true) && strpos($startUrl, 'http') === 0;
$hasBias = ($argc == 5 && $argv[3] === '-p');

if (in_array($modeArg, $helpFlags, true)) {
    echo "Usage:\n";
    echo $argv[0]." [-r, -f] <URL> [-p <bias>]\n";
    echo "Will index <URL> into the Geego Search Engine Project.\n";
    echo "-r, --recursive, -f, --follow all make the indexing engine\n";
    echo "follow ALL links (Excluding relative links and mailto's).\n";
    echo "NOTE: THIS IS ONLY ONCE RECURSIVE.\n";
    echo "The multiple options are reserved for consistency.\n";
    echo "-p <bias> sets the bias value stored for <URL> (default 1).\n";
    echo "Copyright Rudi Visser @ Geego Project 2007.\n";
} elseif ($wantsFollow && ($argc == 3 || $hasBias)) {
    // Without -p the start page gets index()'s default bias of 1.
    $bias = $hasBias ? $argv[4] : 1;

    echo 'Indexing '.$startUrl."\n";
    echo 'Starting Index @ '.$startUrl."\n";
    $queue = index($startUrl, $link, 1, $bias);

    // Only a non-array result means failure; an empty array simply means the
    // page contained no links to follow.
    if (!is_array($queue)) {
        echo "----------\n";
        echo 'INDEX TOTAL FAILURE'."\n";
        echo "----------\n\n";
        exit;
    }
    echo 'Index Completed Successfully'."\n";

    echo '------------ PROCESSING QUEUE ------------'."\n\n";
    foreach ($queue as $http) {
        echo 'INDEXING: \''.$http."'\n";
        // Linked pages are indexed once, without following their own links.
        if (index($http, $link, 0)) {
            echo "success\n";
        } else {
            echo "total failure\n";
        }
        echo 'ENDINDEX \''.$http."'\n";
        ob_flush();
        flush();
    }
} else {
    // Unrecognised invocation: echo back what was received.
    echo $argc."\n";
    echo $argv[0]."\n";
    echo $modeArg."\n";
    echo $startUrl."\n";
    echo "\n";
}
// Save whatever is still in the output buffer to ./lastindex.log, then send
// it to the terminal. Logging is best-effort: if the file cannot be opened
// the run still finishes normally.
// NOTE(review): ob_flush() calls made earlier in the run already emptied the
// buffer, so only output produced after the last ob_flush() is captured here.
$contents = ob_get_contents();
$fp = @fopen("./lastindex.log", 'w');
if ($fp !== false) {
    // Only touch the handle when fopen() actually succeeded.
    fwrite($fp, (string) $contents);
    fclose($fp);
}
ob_end_flush();
?>