#!/usr/bin/perl
#Usage: translate_asian.pl proxy:port native_lang target_lang
#define some variables
#The NATIVE_LANG and TARGET_LANG values must match the language codes that Google uses!
# To date, these are the supported languages:
# English - en
# Chinese (Simplified) - zh-CN
# Korean - ko
# Japanese - ja
# German - de
# Spanish - es
# French - fr
# Italian - it
# Portuguese - pt
use utf8;
use strict;
use warnings;

# Overall flow:
#   1. Write the proxy given on the command line into the wget rc file.
#   2. For every line of asian.txt, run a Google search for that line and save
#      the result page as hits.html.
#   3. Run grablinks.sh to pull the "Translate this page" URLs out of the
#      result page (presumably it reads hits.html - confirm against that script).
#   4. Rewrite each URL from translate? to translate_c? (the non-framed page)
#      and save the list as translatedlinks<N>.txt, N being the search-term number.
#   5. Download every rewritten URL to finalresults<N><M>.html, M being the
#      link number within that search.
#
# External commands are run in list form (no shell), so quotes or other shell
# metacharacters in a search term or a returned link cannot break or hijack
# the command line.

# Percent-encode a string so it can be placed in a URL query value.
# Takes the raw text; returns the encoded copy (unreserved characters per
# RFC 3986 are left as they are, every other byte becomes %XX).
sub url_encode {
    my ($text) = @_;
    # Work on UTF-8 bytes even if the input was decoded to characters.
    utf8::encode($text) if utf8::is_utf8($text);
    $text =~ s/([^A-Za-z0-9\-_.~])/sprintf('%%%02X', ord($1))/ge;
    return $text;
}

#define some variables
# $TARGET_LANG is accepted to match the documented usage line, but nothing
# below reads it.
my ($proxy, $NATIVE_LANG, $TARGET_LANG) = @ARGV;
unless (defined $proxy && length $proxy
    && defined $NATIVE_LANG && length $NATIVE_LANG) {
    die "Usage: $0 proxy:port native_lang target_lang\n";
}

print "Setting proxy to $proxy\n";
#set this to whatever user's home dir you are logged in as...
my $wgetrc = '/user-home-dir/.wgetrc';
open(my $proxy_fh, '>', $wgetrc)
    or die "Cannot write $wgetrc: $!\n";
print {$proxy_fh} "http_proxy=$proxy";
close($proxy_fh)
    or die "Cannot finish writing $wgetrc: $!\n";

#read in the targeted language search words
my $terms_file = 'asian.txt';
open(my $terms_fh, '<', $terms_file)
    or die "Cannot read $terms_file: $!\n";

# $count numbers the search terms (1-based) and is part of the output names.
# It is advanced in the continue block so a skipped term still uses up its
# number and later terms keep the same file names.
my $count = 1;

#for each of those lines
TERM:
while (my $term = <$terms_fh>) {
    chomp $term;    #gets rid of the trailing newline

    #call the google search to search for those chars and save to hits.html
    my $search_url = 'http://www.google.com/search?q=' . url_encode($term)
        . "&hl=$NATIVE_LANG&ie=UTF8&oe=UTF8&num=5";
    unless (system('/usr/bin/wget', '--user-agent=Mozilla',
                   '-O', 'hits.html', $search_url) == 0) {
        warn "Search for term $count failed (wget status " . ($? >> 8) . "), skipping\n";
        next TERM;
    }

    #call the helper script to pull out the Translate this page URLs
    # The command is a fixed string, so backticks are safe here.
    my @links = `/bin/sh grablinks.sh`;
    warn "grablinks.sh exited with status " . ($? >> 8) . " for term $count\n"
        if $? != 0;

    #delete the hits.html page
    # Output of srm -v is captured and discarded, as for the other commands.
    my $srm_output = `/usr/bin/srm -v hits.html`;
    warn "Could not remove hits.html (srm status " . ($? >> 8) . ")\n"
        if $? != 0;

    #Googles translated pages are in frames. We dont want that, but that is how
    #the links they return are created, so we must replace the string
    #translate? with translate_c? for each of those links.
    chomp @links;
    @links = grep { length } @links;    # blank lines are not links
    s/translate\?/translate_c?/g for @links;

    #save the rewritten links for this search term
    # The file is written and closed before anything reads the list back, so
    # no buffered data can be missing. It is kept on disk after the run.
    my $links_file = "translatedlinks$count.txt";
    if (open(my $links_fh, '>', $links_file)) {
        print {$links_fh} "$_\n" for @links;
        close($links_fh)
            or warn "Cannot finish writing $links_file: $!\n";
    }
    else {
        warn "Cannot write $links_file: $!\n";
    }

    #use wget to get and save the pages.
    my $link = 1;
    for my $url (@links) {
        # "--" stops option parsing, so a link starting with "-" is still
        # treated as a URL.
        unless (system('/usr/bin/wget', '--user-agent=Mozilla',
                       '-O', "finalresults$count$link.html", '--', $url) == 0) {
            warn "Download of link $link for term $count failed (wget status "
                . ($? >> 8) . ")\n";
        }
        $link++;
    }
}
continue {
    $count++;
}
#end while

close($terms_fh);