# IMDB lookup via Google "I'm Feeling Lucky": follow the redirect to the
# first imdb.com hit and scrape title, rating, votes and plot outline
# into the globals that back the GUI widgets.
package require http
package require tdom
package require Tk

bind all <Key-Return> {imdb $::query}

# imdb -- search Google for $input restricted to imdb.com, follow the
# "I'm Feeling Lucky" 302 redirect to the title page, and parse the page
# with tdom.  Fills ::url ::title ::rating ::votes and the .s text widget.
proc imdb {input} {
    # NOTE(review): Google reportedly rejects some user agents; this one
    # worked for the original author -- confirm it still does.
    set ua "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/0.9.7e"
    http::config -useragent $ua

    # Don't manually build query strings -- http::formatQuery escapes
    # everything for us.
    set query [http::formatQuery btnI "I'm Feeling Lucky" q "site:imdb.com $input"]
    set http [http::geturl http://www.google.com/search?$query]

    # "I'm Feeling Lucky" answers with an HTTP 302; the http package does
    # not follow redirects, so pull the target from the response headers.
    upvar 0 $http arr
    array set meta $arr(meta)
    http::cleanup $http
    if {![info exists meta(Location)]} {
        # No redirect header: no hit, or Google served an error page.
        return
    }
    set ::url $meta(Location)
    .f1.bi configure -state normal

    set http [http::geturl $::url]
    set html [http::data $http]
    http::cleanup $http

    set doc [dom parse -html $html]
    set ::title [[$doc selectNodes {/html/head/title[1]}] asText]
    # Some title pages label the synopsis "Plot Summary:", others
    # "Plot Outline:" -- try both.
    set outline [$doc selectNodes {string(//div[h5="Plot Summary:"]/text())}]
    if {$outline eq {}} {
        set outline [$doc selectNodes {string(//div[h5="Plot Outline:"]/text())}]
    }
    set outline [string trim $outline]
    .s delete 1.0 end
    .s insert 1.0 $outline
    set ::rating [[$doc selectNodes {//div[@class="general rating"]/b[2]}] asText]
    set votes [[$doc selectNodes {//div[@class="general rating"]/small/a[1]}] asText]
    $doc delete
    # "12,345 votes" -> 12345
    set ::votes [string map {, {}} [lindex [split $votes] 0]]
}

# go -- open url in the default browser (Windows only in this version).
proc go {url} {
    exec cmd /c start {} $url &
}

# --- GUI ---
label .lq -text Query:
entry .q -textvariable query
label .lt -text Title:
entry .t -textvariable title
label .lr -text Rating:
entry .r -textvariable rating
label .lv -text Votes
entry .v -textvariable votes
label .ls -text Outline
text .s -height 3 -wrap word
label .lu -text URL
entry .u -textvariable url
grid .lq .q -sticky ew
grid .lt .t -sticky ew
grid .lr .r -sticky ew
grid .lv .v -sticky ew
grid .ls .s -sticky ew
grid .lu .u -sticky ew
frame .f1
grid .f1 -column 1 -sticky ew
button .f1.bs -text Search -command {imdb $::query}
# Disabled until a search has produced a URL to go to.
button .f1.bi -text "Goto IMDB" -state disabled -command {go $::url}
grid .f1.bs .f1.bi
grid columnconfigure . 1 -weight 1
grid columnconfigure .f1 {0 1} -weight 1
LES on 2008-08-21: This is a nice idea, but this application doesn't quite work for me. First, Google doesn't like Lynx, so we have to tell it a different lie about the user agent. Second, the code above tries to obtain from the headers information that my tests never retrieved: $meta(Location). So I skipped that altogether. Third, I added a switch that lets you open the IMDB page in your default browser on Linux too, if you have KDE (not necessarily using it as your window manager). Fourth, dom chokes on the HTML, so I did what countless developers never hesitate to frown upon: I replaced DOM parsing with regular expressions. Why? Because it works, that's why.

LV: I don't know about this specific application — but I use Lynx with Google and IMDB on a regular basis and have not myself encountered cases where "Google doesn't like Lynx".

LES: Using Lynx as the user agent in this application, the HTML is not the expected Google page. It is another page that says that my browser is not authorized, that I should read the usage terms, etc., or something to that effect.

MJ: Very strange — both versions work correctly for me. The meta(Location) part is there because "I'm Feeling Lucky" uses an HTTP 302 redirect, which the http package doesn't automatically follow. I have no clue why this doesn't work for you, though; very strange indeed. Regarding the use of regexp instead of tdom: "parsing" with regular expressions will break as well, only differently. I guess the main conclusion that can be drawn is that web scraping like this is a hit-and-miss game. The only way to use a service like this robustly is via a web service, which has other issues (SOAP does not make me feel clean).
LES Here is my code:
# LES's variant: no tdom and no redirect handling -- scrape the result
# URL straight out of the Google results page with regular expressions.
package require http
package require Tk

bind all <Key-Return> {imdb $::query}

# imdb -- Google-search imdb.com for $input, scrape rating, genre and
# plot outline from the first title page found, and fill the GUI
# globals ::url ::rating ::genre ::plot and the .s text widget.
proc imdb {input} {
    # Google refused the Lynx user agent, so lie differently.
    http::config -useragent "Firefox 2.0.2"
    set query [http::formatQuery btnG Search q "site:imdb.com $input"]
    set http [http::geturl http://www.google.com/search?$query]
    set html [http::data $http]
    http::cleanup $http

    # First result link that points at an IMDB title page.  Guard the
    # match: if Google's markup changed there is nothing to fetch.
    if {![regexp {<div class=g><a href=['"]?([^<>'" ]+imdb.com/title[^<>'" ]+)} $html => ::url]} {
        return
    }
    .f1.bi configure -state normal

    set http [http::geturl $::url]
    set html [http::data $http]
    http::cleanup $http

    # Each field is optional -- page layout varies between titles, and an
    # unguarded miss would leave the match variable unset and error out.
    set ::rating ""
    if {[regexp {<b>User Rating:</b>(.*?)</div>} $html => _rating]} {
        regsub -all {(<[^<>]+>|more|\n)} $_rating {} _rating
        set ::rating [string trim $_rating]
    }
    set ::genre ""
    if {[regexp {<h5>Genre:</h5>(.*?)</div>} $html => _genre]} {
        regsub -all {(<[^<>]+>|more)} $_genre {} _genre
        set ::genre [string trim $_genre]
    }
    set ::plot ""
    if {[regexp {<h5>Plot Outline:</h5>(.*?)<} $html => _plot]} {
        set ::plot [string trim $_plot]
    }
    .s delete 1.0 end
    .s insert 1.0 "$::genre\n$::plot"
}

# go -- open url with the platform's default browser
# (kfmclient requires KDE to be installed).
proc go {url} {
    if {$::tcl_platform(platform) eq "windows"} {
        exec cmd /c start {} $url &
    } else {
        exec kfmclient exec $url
    }
}

# --- GUI ---
label .lq -text Query:
entry .q -textvariable query
label .lt -text Title:
entry .t -textvariable title
label .lr -text Rating:
entry .r -textvariable rating
label .ls -text Outline
text .s -height 5 -wrap word
label .lu -text URL
entry .u -textvariable ::url
grid .lq .q -sticky ew
grid .lt .t -sticky ew
grid .lr .r -sticky ew
grid .ls .s -sticky ew
grid .lu .u -sticky ew
frame .f1
grid .f1 -column 1 -sticky ew
button .f1.bs -text Search -command {imdb $::query}
# Disabled until a search has produced a URL to go to.
button .f1.bi -text "Goto IMDB" -state disabled -command {go $::url}
grid .f1.bs .f1.bi
grid columnconfigure . 1 -weight 1
grid columnconfigure .f1 {0 1} -weight 1
See also: Web scraping