#!/usr/bin/perl -wT # TODO # 1. Check nothing nasty being uploaded in file upload # 2. Need to do lots of error handling - at the moment almost none :( # 3. Write a nicer HTML submission form # 4. Write a nicer HTML results sheet # 5. Lookup title based on ISBN and keep record of which titles are used to recommend which course # 6. Look into creating @isbns from Amazon purchase history # 7. Look into creating @isbns from RefWorks RefShare RSS feed # 8. Look into creating @isbns from OpenSearch file # 9. Look into creating @isbns from LibraryThing account # 10. Look into recording locations for all possible institutions and map results # 11. Check uploaded file ISBNs are actually ISBNs # 12. Use LibraryThing API to check ISBNs are valid # 13. For each ISBN in the upload file, retrieve other ISBNS from ISBNx or ThingISBN and include them in the file. Need to think about this, don't want to duplicate queries against API - does the data file already contain equivalence? use strict; use XML::Simple qw(:strict); use LWP::UserAgent; use Data::Dumper; use CGI; use CGI::Carp qw ( fatalsToBrowser ); use File::Basename; use HTML::Parser; #Limit file upload to 1Mb $CGI::POST_MAX = 1024 * 1000; # Initialise variables my $debug = 0; my $response_xml; my $match; my $isbn; my $course_code; my $result_html; my $safe_filename_characters = "a-zA-Z0-9_.-"; my $stateID; my $course_url; my $course_valid; my $course_results; my $course_details; my $courses_printed = 0; my $ucas_base_url = "http://search.ucas.com"; my @course_inst_list; # Initialise Hashrefs my $isbns = {}; my $related_courses_set = {}; #Declare a LWP UserAgent we can use my $ua = LWP::UserAgent->new; $ua->agent("MOSAIC_ReadToStudy/0.8"); #Call subroutine to load ISBNs $isbns = load_isbns(); #Call subroutine to find courses related to isbns, one by one $related_courses_set = fetch_related_courses_item($isbns); #Call subroutine to find each course on the UCAS website $stateID = get_stateID(); # Print out the related_courses_set and isbns hashes $result_html = new CGI; print $result_html->header(); print $result_html->start_html( -title=>'Course recommendations', -style=>{'src'=>'recommender.css'} ); #Sort related_courses_set hash by counts foreach $course_code ( sort {$related_courses_set->{$b}->{'count'} <=> $related_courses_set->{$a}->{'count'}} keys %$related_courses_set ) { $course_url = 'http://search.ucas.com/cgi-bin/hsrun/search/search/StateId/'.$stateID.'/HAHTpage/search.HsCodeSearch.submitForm?cmbInst=&txtJacsCode='.$course_code; $course_results = get_html($course_url); $course_valid = parse_course_list($course_results); if (!$course_valid) { next; } if ( $match eq "close" && ( course_loans($course_code)/100 > $related_courses_set->{$course_code}->{'count'} || !course_loans($course_code) ) ) { next; } print "

The following courses correspond to ".uc($related_courses_set->{$course_code}->{'type'})." code ".$course_code.". They have been recommended because of the following books in your collection:

"; $course_details = parse_ucas_results($course_results); print "
"; $courses_printed += 1; } #print Dumper($related_courses_set); #print Dumper($isbns); if ($courses_printed == 0) { print "

I'm afraid that we have not matched any of the ISBNs in your list. If you used the 'close match only' option you could try again and specify the 'all matches' option, or you could try again with a different list, or with one of the sample lists at the Read to Learn Homepage.

"; } print $result_html->end_html; sub load_isbns { # Declare local variables my $isbns = {}; my $query = new CGI; $match = $query->param("match"); if ($query->param("test") == 1) { $isbns->{'0722177755'} = 'Untried'; $isbns->{'0722177763'} = 'Untried'; $isbns->{'0552770884'} = 'Untried'; $isbns->{'043999358X'} = 'Untried'; $isbns->{'0070185662'} = 'Untried'; $isbns->{'0003271323'} = 'Untried'; $isbns->{'0003271331'} = 'Untried'; $isbns->{'0003272788'} = 'Untried'; return $isbns; } my $filename = $query->upload("isbns"); $match = $query->param("match"); if ( !$filename ) { print $query->header ( ); print "There was a problem uploading your file of ISBNs (upload is limited to 1Mb so you might want to check that)."; exit; } my ( $name, $path, $extension ) = fileparse ( $filename, '\..*' ); $filename = $name . $extension; $filename =~ tr/ /_/; $filename =~ s/[^$safe_filename_characters]//g; if ( $filename =~ /^([$safe_filename_characters]+)$/ ) { $filename = $1; } else { die "Filename contains invalid characters"; } my $isbns_filehandle = $query->upload("isbns"); my @isbns = <$isbns_filehandle>; foreach (@isbns) { # Remove newline characters, and then whitespace and hyphens chomp($_); $_ =~ s/\s+//g; $_ =~ s/-//g; # Check that what is left looks like a 10 or 13 char ISBN if ($_ =~ m/^(\d{9}(\d{3})?[\d|X|x])$/ ) { $isbns->{$_} = 'Untried'; } } return $isbns; } sub fetch_related_courses_item { # Declare local variables my $related_courses_item = {}; my $related_courses_set = {}; my $api_url; my $response_xml; #Get hashref that has been passed to subroutine and assign to local hashref my $isbns = $_[0]; foreach my $isbn_query (keys %$isbns) { #Retrieve data via API and assign response $api_url = api_request($isbn_query,'','summary','','',''); $response_xml = get_html($api_url); #Check that XML was retrieved OK and if not skip to the next ISBN if ($response_xml =~ m/ReadToStudy get HTML failed/i) { $isbns->{$isbn_query} = $response_xml; next; } #Check that XML doesn't contain an error tag and if so write error and skip to the next ISBN if ($response_xml =~ m/(.*)<\/error>/i) { $isbns->{$isbn_query} = $1; next; } # Parse reponse, forcing the loansPerCourseCode attributes and value into a hash # using courseCode as a key. my $response_parsed = XMLin($response_xml, forcearray => [ qw(loansPerCourseCode) ], keyattr => ['loansPerCourseCode' => 'courseCode'] ); # Double check that the contents does not contain an error. # Write ISBN back to hash with a status message of NOTFOUND or similar my $response_error = $response_parsed->{error}; if ($response_error) { $isbns->{$isbn_query}='Not Found:'.$response_error; next; } # Put the resulting hash into a hashref $isbns->{$isbn_query}='Found'; $related_courses_item = ($response_parsed->{summary}->{loansPerCourseCode}); # Add an extra 'count' element to the array foreach $course_code (keys %$related_courses_item) { if (exists $related_courses_set->{$course_code}) { $related_courses_set->{$course_code}->{'count'}++; $related_courses_set->{$course_code}->{'content'} += $related_courses_item->{$course_code}->{'content'}; $related_courses_set->{$course_code}->{'isbns'} .= "
  • ".$isbn_query."
  • "; } else { $related_courses_set->{$course_code} = { count => 1, content => $related_courses_item->{$course_code}->{'content'}, type => $related_courses_item->{$course_code}->{'type'}, isbns => "
  • ".$isbn_query."
  • " }; } } } return $related_courses_set; } sub api_request { my ($isbn, $ucas, $show, $prog, $years, $rows) = @_; my $api_url = 'http://library.hud.ac.uk/mosaic/api.pl?isbn='.$isbn.'&ucas='.$ucas.'&show='.$show.'&prog='.$prog.'&years='.$years.'&rows='.$rows; } sub get_stateID { my $course_search_url = 'http://search.ucas.com/cgi-bin/hsrun/search/search/search.hjx;start=search.HsCodeSearch.run?y=2010'; my $response_html = get_html($course_search_url); if ($response_html =~ m/StateID\/(\S{29}-\S{4})\//i) { return $1; } return "Failed to find StateID"; } sub parse_course_list { my $course_list = $_[0];; if ($course_list =~ m/No courses found, please try again./i) { #Need to remove course from hash of courses and not output in html return 0; } #Need to parse out all course links and institutions from results and include in display somehow return 1; } sub course_loans { my $course_code = $_[0]; my $course_url = api_request('',$course_code,'summary','','',''); my $response_xml = get_html($course_url); #Check that XML was retrieved OK and if not return if ($response_xml =~ m/ReadToStudy get HTML failed/i) { return; } #Check if XML contains an error tag and if so return if ($response_xml =~ m/.*<\/error>/i) { return; } my $count_matches = ($response_xml =~ s/loansPerGlobalID /loansPerGlobalID /g); return $count_matches; } sub get_html { my $url =$_[0]; my $response; # Create a request my $req = HTTP::Request->new(GET => $url); my $res = $ua->request($req); # Check if successful or not if ($res->is_success) { $response = $res->content; return $response; } return "ReadToStudy get HTML failed: ".$url." : ".$res->status_line; } sub parse_ucas_results { my $p = HTML::Parser->new(api_version => 3, start_h => [\&a_start_handler, "self,tagname,attr"], report_tags => [qw(a)], ); $p->parse(shift || die) || die $!; } sub a_start_handler { my($self, $tag, $attr) = @_; return unless $tag eq "a"; return unless exists $attr->{href}; return unless $attr->{href} =~ m/.*\?[n|i]\=.*/i; if ($attr->{href} =~ m/.*\?n\=.*/i) { print "