#!/usr/bin/perl -w
# This Script downloads the Directory Information from Dhiraagu Edirectory Website
#
# LICENSE: Feel Free to modify to suit your needs...but pls keep this header !
# FourThirty Inc. 2008
#
# by Typos v 0.2 -- typos@fourthirty.org
# 22-05-2008
#
# CHANGELOG -----
# * Has the ability to continue when invoked with -C or -c
# * Few bug fixes to work with the new version of Dhiraagu site
# ---------------
#
# Usage:
#   edir        fetch every page for each number prefix, starting at page 0
#   edir -c     (or -C) continue from the page saved in .edir.tmp by an
#               earlier, interrupted run
#
# Output: tab-delimited entries appended to Edir-MM_DD_YY.txt in the current
# directory.  .edir.tmp always holds the most recently fetched page and is
# removed only after a fully successful run, so a failed run can be resumed.

use strict;
use warnings;
use lib "/common";
use LWP;
use POSIX qw(strftime);

print "Edir v 0.2 by Typos:\n";

# Scratch file holding the last fetched page; also the resume state for -c.
my $TMP_FILE = '.edir.tmp';

# Directory search endpoint; "&page=N" is appended for every request.
my $BASE_URL = 'http://www.dhivehinet.net.mv/index.php?action=edir';

my $ACCEPT = "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";

# All dhiraagu numbers start with one of these,
# seems useless to check all 0-9.
my @PREFIXES = qw(3 6 7);

# Link to the following result page (only looked for on "nextPage" lines).
my $NEXT_PAGE_RE = qr/index\.php\?action=edir&page=(\d+)"/;

# Pager text; the final page is the one where both numbers are equal.
my $LAST_PAGE_RE = qr/You are currently viewing page (\d+) of (\d+)/;

# NOTE(review): the four patterns below are reproduced exactly as they appear
# in the copy of the script this rewrite was made from.  That copy had clearly
# been through an HTML-stripping step (the readline operators were missing),
# so these patterns have very likely lost literal tag text such as an opening
# <td ...>.  As written the name and number patterns are identical and the
# resume pattern captures the first character of a line, which is probably
# not the intent.  Confirm all four against the site's markup before relying
# on the output or on -c.
my $RESUME_PREFIX_RE = qr/(.).*<\/td>/;    # first digit of the number last processed
my $NAME_CELL_RE     = qr/.*(.+)<\/td>/;   # table cell holding the entry name
my $NUMBER_CELL_RE   = qr/.*(.+)<\/td>/;   # table cell holding the phone number
my $FOOTER_CELL_RE   = qr/<\/td>/;         # last cell of the page; it and everything after are dropped

# Show the progress dots as they are printed rather than at the next newline.
$| = 1;

my $page    = 0;
my @numbers = @PREFIXES;

# Checks whether edir is invoked with the -c / -C (continue) option.
my $resume = grep { $_ eq '-c' || $_ eq '-C' } @ARGV;

if ($resume) {
    my @saved = read_lines($TMP_FILE);

    # Start from the page the saved page links to; stays 0 if it has no link.
    my ($next_page) = scan_paging(@saved);
    $page = $next_page if defined $next_page;

    # The prefix being processed is taken from the last matching saved line.
    my $prefix;
    for my $line (@saved) {
        $prefix = $1 if $line =~ $RESUME_PREFIX_RE;
    }

    # Continue with that prefix and the ones after it.  Stop here, leaving
    # .edir.tmp untouched, rather than skip every prefix and report success.
    my ($start) = grep { defined $prefix && $PREFIXES[$_] eq $prefix } 0 .. $#PREFIXES;
    defined $start
        or die "Could not work out which number to continue from in '$TMP_FILE'\n";
    @numbers = @PREFIXES[ $start .. $#PREFIXES ];

    print "Continuing from Number: $numbers[0] and Page: $page\n";
}

# One output file per run, named after the date the run started.
my $out_file = 'Edir-' . strftime('%m_%d_%y', localtime) . '.txt';

my $browser = LWP::UserAgent->new;
$browser->default_headers->push_header('Accept' => $ACCEPT);

for my $post (@numbers) {
    print "Fetching entries starting with $post: \n";

    # Fetch each page 1 by 1 starting with number
    my $done = 0;
    until ($done) {
        my $response = $browser->post("$BASE_URL&page=$page", [ 'query' => "$post" ]);

        # Fail before overwriting .edir.tmp so the run can be continued with -c.
        $response->is_success
            or die "Request for number $post, page $page failed: "
                 . $response->status_line . "\n";

        my $content = $response->decoded_content;
        $content = '' unless defined $content;

        # Save the page first (resume state), then work on the saved copy.
        write_text('>', $TMP_FILE, $content);
        my @lines = read_lines($TMP_FILE);

        my ($next_page, $is_last) = scan_paging(@lines);

        # Dump the Entries to a file (tab delimited)
        write_text('>>', $out_file, format_entries(@lines));
        print ".";

        if ($is_last) {
            $done = 1;
            $page = 0;    # the next prefix starts again from the first page
        }
        elsif (defined $next_page && $next_page != $page) {
            $page = $next_page;
        }
        else {
            # Neither a usable next-page link nor the final-page marker:
            # repeating the request would loop forever on the same page.
            die "Could not find the page after $page for number $post; "
              . "the fetched page is kept in '$TMP_FILE'\n";
        }
    }
}

unlink $TMP_FILE;
print "\n";
print "Edir: Successfully Completed\n";

# Returns the lines of $file (line endings kept); dies if it cannot be opened.
sub read_lines {
    my ($file) = @_;

    open my $fh, '<', $file or die "Could not open '$file': $!\n";
    my @lines = <$fh>;
    close $fh;

    return @lines;
}

# Writes $text to $file using open mode $mode ('>' to replace, '>>' to append);
# dies if the file cannot be opened or the write cannot be completed.
sub write_text {
    my ($mode, $file, $text) = @_;

    open my $fh, $mode, $file or die "Could not open '$file': $!\n";
    print {$fh} $text;
    close $fh or die "Could not write '$file': $!\n";

    return;
}

# Scans the lines of a result page for paging information.
# Returns ($next_page, $is_last): the page number from the last "nextPage"
# link found (undef if there is none) and a true value when the pager text
# says the current page is the final one.
sub scan_paging {
    my @lines = @_;

    my $next_page;
    my $is_last = 0;

    for my $line (@lines) {
        if ($line =~ /nextPage/) {
            $next_page = $1 if $line =~ $NEXT_PAGE_RE;
        }
        elsif ($line =~ $LAST_PAGE_RE) {
            $is_last = 1 if $1 == $2;
        }
    }

    return ($next_page, $is_last);
}

# Trims the useless html out of a result page and formats the DATA.
# Returns the text to append to the output file: a captured name followed by
# a tab, a captured number followed by a newline.
sub format_entries {
    my @lines = @_;

    # Only lines after the first "Directory Entry" line are data; of those,
    # keep the ones that close a table cell and are not "Directory Entry"
    # lines themselves.
    my $in_listing = 0;
    my @cells;
    for my $line (@lines) {
        if ($line =~ /Directory Entry/) {
            $in_listing = 1;
            next;
        }
        next unless $in_listing;
        push @cells, $line if $line =~ /<\/td>/;
    }

    # Drop everything from the end back to, and including, the footer cell.
    while (@cells) {
        my $line = pop @cells;
        last if $line =~ $FOOTER_CELL_RE;
    }

    my $text = '';
    for my $line (@cells) {
        next if $line =~ /http/;                     # cells holding links are not entries
        $text .= "$1\t" if $line =~ $NAME_CELL_RE;   # name
        $text .= "$1\n" if $line =~ $NUMBER_CELL_RE; # number
    }

    return $text;
}