#!/usr/bin/perl -w
#
# Keyword-focused web crawler.
#
# Starting from $START_URL, the script downloads pages, scores each one with
# GetPageRanking(), saves the HTML-stripped text of every page that scores at
# least $MIN_THRESHOLD into $OUTPUT_FOLDER, and follows the links found on
# pages that either scored well or are still within $MAX_SPREAD hops of a
# well-scoring page.  Crawling stops when no pages are left to visit or when
# $MAX_RESULTS pages have been saved; GenerateIndex() is then called to write
# the index of saved pages.
use strict;
use utf8;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor;
use Encode;

# Subroutine prototypes.
sub ParseText($);           # Returns a string containing the normalized document without HTML markup.
sub ParseDescription($);    # Returns a string containing the description of the HTML document.
sub GenerateIndex();        # Generates an index of the pages crawled.
sub GetBase($);             # Gets the base part of a URL path.
sub AddToRobotFilter($);    # Adds a domain's robots.txt file to the exclude list.
sub IsExcluded($);          # Returns a boolean value indicating whether a page should be visited.
sub LoadKeywords($);        # Reads in a keywords file (assumed to be newline delimited).
sub GetPageRanking($);      # Returns a value between 0 and 1 for the ranking of a page.
sub StripHTML($);           # Gets rid of all HTML tags from a document.

# Settings
my $KEYWORDS_FILE = "pokemon_keywords.txt";
my $MIN_THRESHOLD = 0.5;
my $START_URL     = "http://mondo123.tripod.com/main.html";
my $MAX_SPREAD    = 3;
my $MAX_RESULTS   = 30;
my $OUTPUT_FOLDER = "pokemon_output";

# Initialize the directory: create it, or fall back to opening it when
# mkdir fails (for example because the directory already exists).
my $oDirectory;
if (!mkdir($OUTPUT_FOLDER, 0777) && !opendir($oDirectory, $OUTPUT_FOLDER)) {
    die "Can't open output directory.";
}

# Initialize page getter.
my $browser = LWP::UserAgent->new();
$browser->timeout(15);

# Get list of keywords.
my %keywords;
LoadKeywords($KEYWORDS_FILE);

# Initialize robot filter.
my %robotpages  = ();
my %excludelist = ();
AddToRobotFilter($START_URL);

# Initialize page queue and history.  %pagehistory maps every URL that has
# been queued to its "spread": the number of hops since the last page that
# met $MIN_THRESHOLD.
my @pagequeue;
push(@pagequeue, $START_URL);
my %pagehistory;
$pagehistory{$START_URL} = 0;

# Initialize index.  %PageIndex maps a ranking formatted with "%.4f" to one
# or more "<file number>^pp^<description>" entries joined by "&_&".
my %PageIndex;
my $numResults = 0;

# While there are pages left to crawl and we have slots left to fill.
while (@pagequeue != 0 && $numResults < $MAX_RESULTS) {

    # Get the page we will be working on.  pop() takes the most recently
    # added URL, so the most recently discovered links are visited first.
    my $currentURL = pop(@pagequeue);
    print scalar(@pagequeue) . " $currentURL\n";

    # Download the web page and go on to the next if can't.
    my $request  = HTTP::Request->new(GET => $currentURL);
    my $response = $browser->request($request);
    if ($response->is_error()) {
        print "\t\t" . $response->status_line . "\n";
        next;
    }
    my $contents = $response->content();

    # Get page ranking.
    my $pageText = ParseText($contents);
    my $ranking  = GetPageRanking($pageText);
    print "\t\t$ranking\n";

    # Save this page if it has a good ranking.
    if ($ranking >= $MIN_THRESHOLD) {
        my $outputPath = "$OUTPUT_FOLDER/$numResults.txt";

        # Only count and index the page when its text file could actually be
        # written; otherwise the index would point at a file that is missing.
        if (open(my $fileOut, ">", $outputPath)) {
            print {$fileOut} StripHTML($contents);
            close($fileOut) or warn "Can't finish writing $outputPath: $!\n";

            my $formattedRanking = sprintf("%.4f", $ranking);
            my $entry = "$numResults^pp^" . ParseDescription($contents);
            if (defined $PageIndex{$formattedRanking}) {
                $PageIndex{$formattedRanking} .= "&_&" . $entry;
            }
            else {
                $PageIndex{$formattedRanking} = $entry;
            }
            $numResults++;
        }
        else {
            warn "Can't write $outputPath: $!\n";
        }
    }

    # Push the links on this page if it has a good score or if the spread is still low.
    if ($ranking >= $MIN_THRESHOLD || $pagehistory{$currentURL} < $MAX_SPREAD) {

        # Spread which all of the pages that connect to this one will receive:
        # a good page resets it to 1, any other page passes on its own + 1.
        my $linkSpread = $ranking >= $MIN_THRESHOLD
                       ? 1
                       : $pagehistory{$currentURL} + 1;

        # Extract links and resolve to current page.
        my $page_parser = HTML::LinkExtor->new(undef, $currentURL);
        $page_parser->parse(decode_utf8($contents))->eof;

        # Process all of the links on the page.
        foreach my $link ($page_parser->links) {
            my $currLink = $$link[2];

            # Drop the query string and the fragment.
            $currLink =~ s/\?.*//;
            $currLink =~ s/\#.*//;

            # Make sure the link is HTML: it must end in a directory, an
            # extension-less name, or one of the known page extensions.
            # Written with /x so that layout whitespace is not part of the
            # pattern (a stray blank used to keep ".asp" from ever matching).
            next unless $currLink =~ m{
                (?: / | /[^./]+ | \.html? | \.shtml? | \.cgi | \.jsp
                  | \.asp | \.aspx | \.php | \.pl | \.cfm )
                [^/.]* $
            }x;

            # Skip pages that have already been queued.
            next if defined $pagehistory{$currLink};

            # NOTE(review): this hard-coded limit of 2 is stricter than
            # $MAX_SPREAD -- confirm whether it is intentional.
            next unless $pagehistory{$currentURL} < 2;

            # Load the robots.txt rules for the link's own host before asking
            # whether the link is excluded, so they apply to the first link
            # into a new host as well.
            if (!defined $robotpages{GetBase($currLink)}) {
                AddToRobotFilter($currLink);
            }
            next if IsExcluded($currLink);

            # Add the page to the queue.
            $pagehistory{$currLink} = $linkSpread;
            push(@pagequeue, $currLink);
        }
    }
}

GenerateIndex();

#####################################################################################
sub ParseText($) { my $text = ""; if ($_[0] =~ m/\