Results 1 to 3 of 3

Thread: PHP Bot(Webcrawlers)

  1. #1

    Thread Starter
    Member
    Join Date
    Jun 2007
    Posts
    59

    PHP Bot(Webcrawlers)

    Hi all!

    I am developing a search engine (which is done),

    but now I need to crawl the web with bots/spiders, etc.

    I am unsure how or where to start, so if anyone could assist me it would be very helpful. Thanks in advance!

    ~Tiv

  2. #2
    WiggleWiggle dclamp's Avatar
    Join Date
    Aug 2006
    Posts
    3,527

    Re: PHP Bot(Webcrawlers)

    try something from google
    My usual boring signature: Something

  3. #3
    Hyperactive Member
    Join Date
    Dec 2006
    Location
    Ubuntu Haters Club
    Posts
    405

    Re: PHP Bot(Webcrawlers)

    Wow, this is old and brings back some bad memories. It's awful but might be some place to start:
    PHP Code:
    #!/usr/bin/php
    <?php
    // Command-line indexer: fetches a URL, extracts its text and links, and
    // stores the result through dbquery().
    // NOTE(review): dbquery() and $link are used further down but are not
    // defined in this file; presumably this include provides them - confirm.
    include("/home/default/geego.co.uk/user/library/mysql.php");

    // Buffer all output; the tail of the script copies the buffer into
    // ./lastindex.log before flushing it.
    ob_start();

    // User agent sent by PHP's HTTP stream wrapper (used by file_get_contents).
    ini_set('user_agent', "Mozilla/5.0 (compatible; Geegobot/0.1; +http://www.geego.co.uk)");

    /**
     * Fetch one page, extract its title, headings, links and plain text, and
     * insert it into `geego_index` unless a row for that address already exists.
     *
     * @param string $url       Address of the page; fetched with file_get_contents().
     * @param mixed  $link      Passed unchanged as the second argument of dbquery()
     *                          (presumably the database connection - confirm).
     * @param mixed  $showlinks When == 1, every accepted link is echoed as it is
     *                          found and the list of links is returned.
     * @param mixed  $bias      Value stored with the row; defaults to 1.
     *
     * @return array|bool False when the page could not be fetched. Otherwise, with
     *                    $showlinks == 1, the list of absolute http:// links found
     *                    (whether or not the page was already indexed); without it,
     *                    false if the address was already indexed and true after a
     *                    new row has been inserted.
     */
    function index($url, $link, $showlinks, $bias = 1)
    {
        $contents = file_get_contents($url);
        if ($contents === false) {
            // Nothing was fetched, so there is nothing to index.
            return false;
        }

        // PCRE replaces the removed ereg* functions. "s" lets "." span newlines
        // and "i" keeps the case-insensitivity of eregi(). The title match is
        // non-greedy so it stops at the first closing tag.
        // $title stays null when no <title> is present so that the <result>
        // element below still reports "Warning" for such pages.
        $title = preg_match('/<title>(.*?)<\/title>/is', $contents, $match) ? $match[1] : null;
        $body = preg_match('/<body[^>]*>(.*)<\/body>/is', $contents, $match) ? $match[1] : '';

        // Images carry no indexable text.
        $stripped_body = preg_replace('/<img[^>]*>/', '', $body);
        $stripped_body = str_replace('</img>', '', $stripped_body);

        // Primary keywords: the text of every <h1>, comma separated.
        preg_match_all("/<h1[^>]*>(.*?)<\/h1>/im", $stripped_body, $h1s);
        $keywords = implode(', ', array_map('strip_tags', $h1s[1]));

        // Collect outgoing links. Only hrefs that contain "http://" and do not
        // contain "mailto:" are kept, so relative links are skipped.
        preg_match_all("/<a href=\"(.*?)\"[^>]*>(.*?)<\/a>/im", $stripped_body, $as);
        $links = "";
        $retas = array();
        foreach ($as[1] as $a) {
            if (strpos($a, 'mailto:') !== false || strpos($a, 'http://') === false) {
                continue;
            }
            $retas[] = $a;
            if ($showlinks == 1) {
                echo 'found link \''.$a."'\n\t- Queued for Index\n";
                ob_flush();
                flush();
            }
            $links .= "\t\t<link>".strip_tags($a)."</link>\n";
        }

        // Secondary keywords: the text of every <h2>, comma separated.
        preg_match_all("/<h2[^>]*>(.*?)<\/h2>/im", $stripped_body, $h2s);
        $lkeywords = implode(', ', array_map('strip_tags', $h2s[1]));

        // Drop forms, embedded objects, scripts and comments together with their
        // contents. Without the "s" modifier these patterns never matched an
        // element spanning several lines, so e.g. script source leaked into the
        // indexed text once strip_tags() removed the surrounding tags.
        $stripped_body = preg_replace('/<form[^>]*>(.*?)<\/form>/is', '', $stripped_body);
        $stripped_body = preg_replace('/<object[^>]*>(.*?)<\/object>/is', '', $stripped_body);
        $stripped_body = preg_replace('/<script[^>]*>(.*?)<\/script>/is', '', $stripped_body);
        $stripped_body = preg_replace('/<!--.*?-->/s', '', $stripped_body);

        // NOTE(review): this literal may itself be a mis-encoded character in the
        // copy of the script this was restored from - confirm the intended byte.
        $stripped_body = str_replace("Ã", "", $stripped_body);
        $stripped_body = strip_tags($stripped_body);
        $stripped_body = htmlentities($stripped_body);
        $stripped_body = str_replace("&nbsp;", "", $stripped_body);

        // Double-escape these named entities; presumably because the XML document
        // built below would otherwise reference entities XML does not define.
        $entities = array("&acirc;", "&cent;", "&oacute;", "&eacute;", "&aacute;", "&ntilde;", "&pound;");
        foreach ($entities as $entity) {
            $stripped_body = str_replace($entity, "&amp;".substr($entity, 1), $stripped_body);
        }

        // Remove the byte sequence EF BF BD (UTF-8 for U+FFFD). The original did
        // this through a urlencode()/urldecode() round trip.
        $stripped_body = str_replace("\xEF\xBF\xBD", "", $stripped_body);

        // First 150 bytes of the cleaned text serve as the description.
        $description = substr($stripped_body, 0, 150);

        $xmlinsert = "<?xml version=\"1.0\" ?>\n";
        $xmlinsert .= "<geego_index>\n";
        $xmlinsert .= "\t<title>".htmlentities($title)."</title>\n";
        $xmlinsert .= "\t<links>".$links."</links>\n";
        $xmlinsert .= "\t<keywords>".htmlentities($keywords)."</keywords>\n";
        $xmlinsert .= "\t<lesser_keywords>".htmlentities($lkeywords)."</lesser_keywords>\n";
        $xmlinsert .= "\t<description>".$description."</description>\n";
        $xmlinsert .= "\t<body>".$stripped_body."</body>\n";
        $xmlinsert .= "\t<result>".(isset($lkeywords, $title, $stripped_body, $keywords) ? "Success" : "Warning")."</result>\n";
        $xmlinsert .= "</geego_index>";

        // The URL is escaped the same way as every other interpolated value.
        // NOTE(review): addslashes() is not a substitute for the driver's own
        // escaping or for prepared statements; switch once dbquery() allows it.
        $safeUrl = addslashes($url);

        // Do not index the same address twice.
        $query = dbquery("SELECT * FROM `geego_index` WHERE `site_address` = '".$safeUrl."'", $link);
        if (mysql_num_rows($query) != 0) {
            return ($showlinks == 1) ? $retas : false;
        }

        dbquery("INSERT INTO `geego_index` VALUES (null, '".$safeUrl."', '".addslashes($title)."', '".addslashes($stripped_body)."', '".addslashes($description)."', UNIX_TIMESTAMP(), '".addslashes($keywords.($lkeywords != "" ? ', '.$lkeywords : ''))."', 'force', 'Y', '".addslashes($bias)."', '".addslashes($xmlinsert)."');", $link);

        return ($showlinks == 1) ? $retas : true;
    }

    // Command-line dispatch.
    //   <script> -h | --help | -help | -?      print usage
    //   <script> (-r|-f|...) <URL>             index <URL>, then every link found on it
    //   <script> (-r|-f|...) <URL> -p <bias>   same, passing <bias> to the first index() call
    // The argument counts (3 and 5) follow from the highest $argv index each
    // form reads.
    $followFlags = array('-r', '--recursive', '-f', '--follow');
    $firstArg = isset($argv[1]) ? $argv[1] : null;
    $hasTarget = in_array($firstArg, $followFlags, true)
        && isset($argv[2])
        && strpos($argv[2], "http") !== FALSE;
    $hasBias = $argc == 5 && $argv[3] === "-p";

    if (in_array($firstArg, array('--help', '-help', '-h', '-?'), true)) {
    ?>

      Usage:
      <?php echo $argv[0]; ?> [-r, -f] <URL>

      Will index <URL> into the Geego Search Engine Project.

      -r, --recursive, -f, --follow all make the indexing engine
      follow ALL links (Excluding relative links and mailto's).

      NOTE: THIS IS ONLY ONCE RECURSIVE.

      The multiple options are reserved for consistency.

      Copyright Rudi Visser @ Geego Project 2007.

    <?php
    } elseif ($hasTarget && ($argc == 3 || $hasBias)) {
        // Without -p the start page gets index()'s default bias of 1.
        $bias = $hasBias ? $argv[4] : 1;

        echo 'Indexing '.$argv[2]."\n";
        echo 'Starting Index @ '.$argv[2]."\n";

        // $showlinks = 1 makes index() return the list of links to follow.
        // NOTE(review): $link is not assigned anywhere in this file; presumably
        // the included mysql.php defines it - confirm.
        $queue = index($argv[2], $link, 1, $bias);

        // Loose comparison on purpose (unchanged): an empty link list also
        // compares equal to false and is reported as a failure.
        if ($queue == false) {
            echo "----------\n";
            echo 'INDEX TOTAL FAILURE'."\n";
            echo "----------\n\n";
            exit;
        }
        echo 'Index Completed Successfully'."\n";

        echo '------------ PROCESSING QUEUE ------------'."\n\n";
        foreach ($queue as $http) {
            echo 'INDEXING: \''.$http."'\n";
            // Linked pages are indexed without following their own links.
            if (index($http, $link, 0)) {
                echo "success\n";
            } else {
                echo "total failure\n";
            }
            echo 'ENDINDEX \''.$http."'\n";
            ob_flush();
            flush();
        }
    } else {
        // Unrecognised arguments: dump what was received. The final line of the
        // template below sits outside the PHP tags and is therefore printed
        // verbatim; it is kept exactly as in the original.
    ?>
        <?=$argc."\n"?>
        <?=$argv[0]."\n"?>
        <?=(isset($argv[1]) ? $argv[1] : '')."\n"?>
        <?=(isset($argv[2]) ? $argv[2] : '')."\n"?>
        echo "\n";
    <?php
    }

    // Best-effort copy of whatever is still in the output buffer to the log
    // file. Output already pushed out by ob_flush() above is no longer in the
    // buffer, and the early exit on failure skips this step entirely.
    $contents = ob_get_contents();
    $fp = @fopen("./lastindex.log", 'w');
    if ($fp !== false) {
        fwrite($fp, $contents);
        fclose($fp);
    }
    ob_end_flush();
    ?>
    » Twitter: @rudi_visser : Website: www.rudiv.se «

    If Apple fixes security flaws, they are heralded as proactive. If Microsoft fixes a security flaw, they finally got around to fixing their buggy OS.

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  



Click Here to Expand Forum to Full Width