#!/usr/bin/perl
#
# Usage: translate_asian.pl proxy:port native_lang target_lang
#
# For every search phrase (one per line) in asian.txt this script:
#   1. queries Google for the phrase and saves the result page as hits.html;
#   2. runs grablinks.sh and captures the links it prints (presumably the
#      "Translate this page" URLs parsed out of hits.html -- verify against
#      grablinks.sh, which is not part of this file);
#   3. rewrites "translate?" to "translate_c?" in each link, because Google's
#      translated pages are served in frames and the translate_c form is the
#      unframed page, then stores the links in translatedlinks<N>.txt;
#   4. downloads every link to finalresults<N><M>.html.
# N is the 1-based line number of the phrase in asian.txt, M the 1-based
# position of the link.  NOTE(review): N and M are concatenated without a
# separator, so names can collide once either exceeds 9 (e.g. 1+12 vs 11+2);
# the scheme is kept unchanged for compatibility with existing consumers.
#
# The LANG values must correspond to what Google uses as its language
# definitions!  To date, these are the languages supported:
#   English              - en
#   Chinese (Simplified) - zh-CN
#   Korean               - ko
#   Japanese             - ja
#   German               - de
#   Spanish              - es
#   French               - fr
#   Italian              - it
#   Portuguese           - pt

use strict;
use warnings;
use utf8;

# External tools and files.
my $WGET   = '/usr/bin/wget';
my $SRM    = '/usr/bin/srm';
# Set this to the home directory of whatever user you are logged in as.
my $WGETRC = '/user-home-dir/.wgetrc';

# Percent-encode every byte that is not an RFC 3986 "unreserved" character,
# so the search phrase cannot break the query string (&, #, spaces, ...).
# The phrase is read without an encoding layer, so it is encoded byte-wise.
sub url_encode {
    my ($bytes) = @_;
    $bytes =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord $1)/ge;
    return $bytes;
}

# Run an external command without going through the shell (list-form
# system), so phrases and scraped URLs cannot inject shell syntax.
# Failures are reported but do not abort the run (best effort, as before).
sub run_command {
    my (@command) = @_;
    system(@command) == 0
        or warn "Command failed (status $?): @command\n";
    return;
}

@ARGV >= 3
    or die "Usage: $0 proxy:port native_lang target_lang\n";

# $TARGET_LANG is accepted for interface compatibility; nothing below uses it.
my ($proxy, $NATIVE_LANG, $TARGET_LANG) = @ARGV;

print "Setting proxy to $proxy\n";
if (open my $wgetrc_fh, '>', $WGETRC) {
    print {$wgetrc_fh} "http_proxy=$proxy\n";
    close $wgetrc_fh
        or warn "Cannot finish writing $WGETRC: $!\n";
}
else {
    # Keep going without the proxy setting rather than aborting the run.
    warn "Cannot write $WGETRC: $!\n";
}

# Read in the targeted-language search words.
open my $phrases_fh, '<', 'asian.txt'
    or die "Cannot read asian.txt: $!\n";

my $count = 1;
while (my $phrase = <$phrases_fh>) {
    chomp $phrase;

    # Search Google for the phrase and save the result page to hits.html.
    my $search_url = 'http://www.google.com/search?q='
        . url_encode($phrase)
        . "&hl=$NATIVE_LANG&ie=UTF8&oe=UTF8&num=5";
    run_command($WGET, '--user-agent=Mozilla', '-O', 'hits.html', $search_url);

    # Capture the links printed by grablinks.sh (constant command string,
    # nothing interpolated, so the shell is safe to use here).
    my $raw_links = qx{/bin/sh grablinks.sh};
    $raw_links = '' unless defined $raw_links;

    # Delete the hits.html page.  The original built this command but never
    # ran it, so hits.html was left behind.
    run_command($SRM, 'hits.html');

    # Google's translated pages are in frames.  We don't want that, so replace
    # "translate?" with "translate_c?" in each link.  Blank lines are skipped.
    my @links;
    for my $link_line (split /\n/, $raw_links) {
        next if $link_line eq '';
        $link_line =~ s/translate\?/translate_c?/g;
        push @links, $link_line;
    }

    # Write the links file directly and close it, so the data is on disk
    # before anything else looks at it (no temporary files needed).
    my $links_file = "translatedlinks$count.txt";
    open my $links_fh, '>', $links_file
        or die "Cannot write $links_file: $!\n";
    print {$links_fh} "$_\n" for @links;
    close $links_fh
        or die "Cannot finish writing $links_file: $!\n";

    # Use wget to get and save the pages.
    my $link_no = 1;
    for my $url (@links) {
        # '--' stops a URL beginning with '-' from being parsed as an option.
        run_command($WGET, '--user-agent=Mozilla',
            '-O', "finalresults$count$link_no.html", '--', $url);
        $link_no++;
    }

    $count++;
}    # end while

close $phrases_fh;