#!/usr/bin/perl -w
# Author:   $Author: merkosh $
# Revision: $Rev: 57 $
############################################################################
#    Copyright (C) 2005 by Uwe Mayer                                       #
#    merkosh@hadiko.de                                                     #
#                                                                          #
#    This program is free software; you can redistribute it and/or modify  #
#    it under the terms of the GNU General Public License as published by  #
#    the Free Software Foundation; either version 2 of the License, or     #
#    (at your option) any later version.                                   #
#                                                                          #
#    This program is distributed in the hope that it will be useful,       #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of        #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
#    GNU General Public License for more details.                          #
#                                                                          #
#    You should have received a copy of the GNU General Public License     #
#    along with this program; if not, write to the                         #
#    Free Software Foundation, Inc.,                                       #
#    59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             #
############################################################################

#-----------------------------------------------------------#
# scan Amazon for a number of fields on a movie title       #
#-----------------------------------------------------------#

#-- imports --------------------------------------------------------------------
use URI::Escape;
use LWP::UserAgent;
use HTTP::Request;
use HTML::Entities;
use File::Basename;
use HTML::TreeBuilder;
use Math::Round;

use LMCTools;


#-- display help screen --------------------------------------------------------
# Print the usage text to STDERR and stop when the script is called without
# arguments or when one of the arguments is exactly "-h" or "--help".
# The pattern is anchored on purpose: an unanchored /--help|-h/ also fires on
# any argument that merely contains "-h" (for instance the title "Die-hard"),
# which would make such titles impossible to search for.
if (!@ARGV || (grep { /\A(?:--help|-h)\z/ } @ARGV)) {
  print STDERR <<HELP;
amazon-en.pl  \$Rev\$  (c)  2005-01-04  by Uwe Mayer

Search Amazon.com for picture on a movie title.

Synopsis: amazon-en.pl [-h|--help] <title>|<URL>

     -h     --help     this screen
     <title>           search for <title> on Amazon.com and return either
                       the information or a list of matches
     <URL>             get information from this URL

The URL is distinguished from the title by the prefix \'http://\'. If
your title happens to have this prefix you\'re busted. ;)

If your internet connection needs a proxy server set the environment
variable "http_proxy" to the appropriate url 
(i.e. http_proxy=http://proxy.somehost.com:8080)

HELP
  exit();

}


#-- parse arguments ------------------------------------------------------------
# Decide whether a command line argument is a URL rather than a movie title.
#   $arg    - the raw argument (may be undef)
#   returns - 1 when $arg starts with "http://" or "https://" (the scheme is
#             compared case-insensitively), 0 otherwise
sub _isURL {
  my ($arg) = @_;
  return (defined $arg && $arg =~ m{\Ahttps?://}i) ? 1 : 0;
}

# Exactly one of these is filled from the first argument; the other stays
# empty so the later "title" and "URL" sections can test which mode applies.
my $title = "";
my $URL = "";

if (_isURL($ARGV[0])) {
  $URL = $ARGV[0];
} else {
  $title = $ARGV[0];
}


#-- scan amazon ----------------------------------------------------------------
# Package-wide Amazon.com addresses:
#   $baseURL   - site root; printed in front of every link found by a search
#   $searchURL - DVD title search; the URI-escaped title is appended to it
our ($baseURL, $searchURL) = (
  'http://www.amazon.com/',
  'http://www.amazon.com/exec/obidos/search-handle-url/index=dvd&field-title=',
);


#-- title ----------------------------------------------------------------------
# Number of result pages needed to list $hits search results.
#   $hits    - total number of hits reported by the search page
#   $perPage - hits shown per page (defaults to 10)
#   returns  - ceil($hits / $perPage), but never less than 1 because the first
#              page has already been downloaded when this is called
sub _pageCount {
  my ($hits, $perPage) = @_;
  $perPage = 10 unless $perPage;
  return 1 unless $hits && $hits > 0;
  return int(($hits + $perPage - 1) / $perPage);
}

# Concatenate a base URL and a site-relative path with exactly one "/" between
# them, whatever slashes the two parts carry themselves.
sub _joinURL {
  my ($base, $path) = @_;
  $base =~ s{/+\z}{};
  $path =~ s{\A/+}{};
  return "$base/$path";
}

# Title mode: search Amazon for the title, walk through all result pages,
# collect the plain-text links to product detail pages and print them as a
# "status: list" record set.  Ends the script when done.
if (defined $title && $title ne '') {
  # default variables
  my $section = "Amazon.com Pictures:";
  my $query   = $searchURL . uri_escape($title);   # identical for every page
  my %match   = ();     # link target => {Text, URL}; a later duplicate wins
  my @order   = ();     # link targets in the order they were first seen

  # get list of titles
  my $page = getPage($query)->content();

  # try to get number of hits to calculate number of consecutive pages
  my $pageCount = 1;
  if ($page =~ /All (\d+) results for/) {
    $pageCount = _pageCount($1);       # ten hits per page
  }

  # process all pages
  for my $pg (1 .. $pageCount) {
    # search for links which look like our target
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($page);
    $tree->eof();                      # flush the parser before reading the tree

    # filter target links
    foreach my $link (@{ $tree->extract_links() }) {
      my ($target, $element) = @{$link};

      # URL matches the detail page prefix
      next unless substr($target, 0, 23) eq "/exec/obidos/tg/detail/";

      # link must start with clear text: skip empty links and links whose
      # first child is another element (e.g. an image)
      my ($first) = $element->content_list();
      next unless defined $first;
      next if ref $first;

      # link text must not contain "buy" or "in-store"
      next if $first =~ /buy|in\-store/i;

      my $href = $element->attr('href');
      next unless defined $href;

      # target link found; copy what is needed so the tree can be released
      push @order, $target unless exists $match{$target};
      $match{$target} = {Text => $first, URL => $href};
    }
    $tree->delete();                   # free the parse tree of this page

    # download next page
    if ($pg < $pageCount) {
      $page = getPage($query . "&pg=" . ($pg + 1))->content();
    }
  }


  #-- output Results
  print "status: list\n";
  print "section: $section\n";

  foreach my $target (@order) {
    my $rec = $match{$target};
    print "title: $rec->{Text}\n";
    print "url: " . _joinURL($baseURL, $rec->{URL}) . "\n";
    print "\n";
  }
  exit;
}


#-- URL ------------------------------------------------------------------------
# Extract the product ID from a detail page URL.
#   $url    - URL (string or object that stringifies to one)
#   returns - the text between "detail/-/" and the first following "?",
#             or undef when the URL does not have that shape
sub _pictureID {
  my ($url) = @_;
  return unless defined $url;
  return $1 if "$url" =~ m{detail/-/(.+?)\?};
  return;
}

# Find the address of the product picture inside a detail page.
#   $page   - HTML source of the page
#   $id     - product ID as returned by _pictureID
#   returns - the first double-quoted http://images.amazon.com/images/... URL
#             whose file name starts with "$id.", or undef when there is none
# The ID is matched literally (\Q..\E) and the match may not run across a
# double quote, so it cannot swallow markup between two unrelated attributes.
sub _pictureURL {
  my ($page, $id) = @_;
  return unless defined $page && defined $id;
  return $1
    if $page =~ m{"(http://images\.amazon\.com/images/[^"]+?/\Q$id\E\.[^"]+?)"};
  return;
}

# URL mode: download the detail page and print the fields found on it as a
# "status: details" record.
if ($URL){
  #-- get main page
  my $response = getPage($URL);
  my $page = $response->content();
  # when querying for a list and receiving details the url
  # of the page has changed: update this
  $URL = $response->base();

  # fields we are looking out for:
  my %data = ();
  # - picture
  
  # the following are not available
  # - originalTitle
  # - year
  # - director
  # - category
  # - actors
  # - rating
  # - url
  # - length
  # - country
  # - language
  # - description
  # - comments
  # - translated title
  # - producer

  # picture
  my $ID = _pictureID($URL);
  unless (defined $ID) {
    print "status: error\n";
    print STDERR "url: $URL\n";
    print STDERR "Could not extract picture ID from URL\n";
    exit 1;     # stop here: do not report "details" after an error
  }

  my $picture = _pictureURL($page, $ID);
  $data{picture} = $picture if defined $picture;

  #-- output
  print "status: details\n";

  foreach my $k (sort keys %data){
    if ($k =~ /description|actors/){
      # these fields hold a list of values: one output line per value
      foreach my $value (@{$data{$k}}){
	print "$k: $value\n";
      }
    }
    else {
      print "$k: $data{$k}\n";
    }
    print "\n";
  }


}
