///////
   //    HtmlParser.cc
   //    HtmlParser Class definitions
   //
   //    Class for parsing HTML documents
   //
   //    Copyright (c) 1999-2004 Comune di Prato - Prato - Italy
   //    Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
   //
   //    For copyright details, see the file COPYING in your distribution
   //    or the GNU General Public License version 2 or later 
   //    <http://www.gnu.org/copyleft/gpl.html>
   //
   //    $Id: HtmlParser.cc,v 1.73 2004/05/04 14:14:37 angusgb Exp $
   //
   //    G.Bartolini
   //    started: 30.03.2000
///////

#include <ctype.h>   // for isspace()

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "HtmlParser.h"
#include "HtSGMLCodec.h"
#include "Configuration.h" // for META attributes parsing

   // Static variables initialization
   // 'debug' is a static member, hence shared by every HtmlParser instance.
   // The parser prints more diagnostics on cout as the level grows
   // (the code tests for levels > 0, > 1, > 3 and > 5).
      int HtmlParser::debug = 0;

// This defines the maximum number of characters allowed in an HTML tag
// between the starting '<' and the closing '>'. A longer candidate is
// treated as a stray '<' in the text (a missing '&lt;') and is skipped.
#define MAX_TAG_SIZE 4096

// Types of tags: start, end, empty (bit flags combined in 'tag_type')
#define TAGstart    0x0001
#define TAGend      0x0002
#define TAGempty    0x0004

// Location in the document (bit flags combined in 'location')
#define TAGhead     0x0001 // The <HEAD> tag is open
#define TAGtitle    0x0002 // The <TITLE> tag is open
#define TAGlink     0x0004 // The <A> tag is open
#define TAGscript   0x0008 // The <SCRIPT> tag is open
#define TAGimg      0x0010 // The <IMG> tag is open
#define TAGhx       0x0020 // Current Tag: <Hx>
#define TAGb        0x0040 // Current Tag: <B>
#define TAGi        0x0080 // Current Tag: <I>
#define TAGblink    0x0100 // Current Tag: <BLINK>
#define TAGinput    0x0200 // Current Tag: <INPUT>

// Accessibility info (ACHECK - accessibility check) for documents
// (bit flags combined in 'doc_acheck')
#define ACHECKDOCtitle   0x0001 // The document title is present

// Accessibility info (ACHECK - accessibility check) for tags
// (bit flags combined in the per-tag 'tag_acheck' of ParseTag())
#define ACHECKTAGalt   0x0001 // The ALTernative text has been specified
#define ACHECKTAGinputimg 0x0002 // The INPUT is an image

// ALT text checks (bit flags tested on the value returned by CheckAlt())
#define ALTempty		0x0001	// Empty ALT
#define ALTsameasfile	0x0002	// Same name as file
#define ALTlong			0x0004	// ALT too long

//*****************************************************************************
// void HtmlParser::encodeURL(String &str, char *valid)
//   Convert a normal string to a URL 'safe' string.  This means that
//   all characters not explicitly mentioned in the URL BNF will be
//   escaped.  The escape character is '%' and is followed by 2 hex
//   digits (upper case) representing the octet.
//
//   str   - string to encode; it is replaced by its encoded form.
//   valid - NUL-terminated list of extra characters that must be kept
//           as they are, in addition to ASCII letters and digits.
//           A null pointer is accepted and means "no extra characters".
//
void HtmlParser::encodeURL(String &str, char *valid)
{
    // The table is read-only: a string literal must not be bound to a
    // plain 'char *' (not allowed since C++11).
    static const char *const digits = "0123456789ABCDEF";
    String      temp;
    char        *p;

    for (p = str; p && *p; p++)
    {
        // Use the unsigned value of the octet: the <ctype.h> functions and
        // the nibble extraction below are only well defined for values in
        // the 'unsigned char' range, whatever the signedness of 'char'.
        const unsigned char c = (unsigned char) *p;

        if (isascii(c) && (isdigit(c) || isalpha(c)
            || (valid && strchr(valid, c))))
            temp << *p;
        else
        {
            temp << '%';
            temp << digits[(c >> 4) & 0x0f];    // high nibble
            temp << digits[c & 0x0f];           // low nibble
        }
    }
    str = temp;
}

// Default constructor.
// No scheduler and no base URL are attached yet; the tag type, location and
// document accessibility flag words are cleared. 'ignore' starts false,
// while 'memo' and 'store_statement' start true.
HtmlParser::HtmlParser()
   : CurrentScheduler(0),
     BaseUrl(0),
     Charset(0),
     DocType(0),
     LinkDescription(0),
     Description(0),
     Keywords(0),
     DocLanguage(0),
     CurrentHx(0),
     PreviousHx(0),
     CurrentAltText(0),
     CurrentResourceRef(0),
     AltAttrPosition(0),
     ignore(false),
     memo(true),
     tag_type(0),
     location(0),
     doc_acheck(0),
     store_statement(true)
{
   // Nothing else to do: everything is set up by the initializer list.
}

// Destructor.
// BaseUrl is released only when it is set and is not the very same object
// as the scheduler's current URL.
HtmlParser::~HtmlParser()
{
   // No base URL at all: nothing to release
   if (!BaseUrl)
      return;

   // The base URL is just the scheduler's current URL: leave it alone
   if (BaseUrl == CurrentScheduler->CurrentUrl)
      return;

   // Base Url different from CurrentUrl. So delete it.
   delete BaseUrl;
}


// Function-call operator: makes HtmlParser usable as a function object.
// The Scheduler uses it to parse a document it has previously retrieved
// (the contents are read from scheduler.CurrentResponse).
//
// The document is scanned once, from the beginning to the terminating NUL:
//   - "<!-- ... -->" comments are skipped;
//   - any other "<! ... >" declaration is copied into 'text' and, when it
//     starts with "doctype", what follows is kept as the document type;
//   - "< ... >" tags are copied into 'text' and handed over to ParseTag();
//   - every other character is plain text: it is appended to the title or
//     to the current link description, depending on 'location'.
// The row counter is incremented on every line feed (character 10).
//
// At the end title, charset, doctype, description and keywords are set on
// the scheduler's current URL and, if the "accessibility_checks" option is
// enabled, the document level checks (title and language) are recorded.
//
// Returns HtmlParser_OK on success; HtmlParser_StatementFailed,
// HtmlParser_AttributeFailed or HtmlParser_LinkFailed as soon as ParseTag()
// reports the corresponding failure; HtmlParser_AccessibilityCheckFailed
// when a document level accessibility check cannot be inserted.

HtmlParser::HtmlParser_Codes HtmlParser::operator() (Scheduler &scheduler)
{

   // Initialization
   CurrentScheduler = &scheduler;
   location = 0;
   ignore = false;
   memo = true;
   tag_type = 0;
   doc_acheck = 0;


   // HTML Title of the document
   String Title(0);

   // Set debug Level
   SetDebugLevel(CurrentScheduler->GetDebugLevel());

   // Contents of the document (a private copy: the parser writes into it)
   String Contents(CurrentScheduler->CurrentResponse->GetContents());

   // position is set to the beginning of the retrieved document contents
   position = (unsigned char *) Contents.get();

   // Initialize the tag position index
   TagPosition = 0;
   LastLinkTagPosition = 0;

   // Initialize the row number
   row = 1;

   // Initialize the charset string
   Charset.trunc();

   // Initialize the doctype string
   DocType.trunc();

   // Initialize the description string
   Description.trunc();

   // Initialize the keywords string
   Keywords.trunc();

   // Initialize the document language string
   DocLanguage.trunc();

   // Initialise the current and previous header information
   PreviousHx = 0;
   CurrentHx = 0;

   // Initialize the current ALT text
   CurrentAltText.trunc();

   // Initialize the resource reference
   CurrentResourceRef.trunc();

   // Attribute position for ALT (inside the tag)
   AltAttrPosition = 0;

   // Assign the base URL used for resolving relative paths
   BaseUrl = CurrentScheduler->CurrentUrl;

   // Let's start parsing the HTML document, from the beginning
   while (*position)
   {

      // Let's check for a comment or a possible DTD declaration

      if (strncmp((char *)position, "<!", 2) == 0)
      {
         position +=2;
         if (strncmp((char *)position, "--", 2) == 0)
         {
            position += 2;
            // Yes ... it is a comment - Go to its end
            do    // Loop until we find a '>' preceded by 2 '-' at least
            {

               int cons_dashes = 0; // Counter for consecutive dashes
               for (ppos = position; *ppos && (cons_dashes < 2); ++ppos)
               {
                  if (*ppos == '-')
                     ++cons_dashes;
                  else
                  {
                     cons_dashes = 0;
                     if (*ppos == (unsigned char) 10)
                        ++row;
                  }
               }

               if (cons_dashes < 2)
               {
                  // The document ended before the comment was closed:
                  // cut the contents here, so that the main loop stops
                  *position ='\0';
                  break;
               }
               else
               {

                  // Here we are after a '--'
                  position = ppos;

                  // Skip extra dashes after a badly formed comment
                  while (*position == '-')
                     ++position;

                  // Skip whitespace
                  while (isspace(*position))
                  {
                     if (*position == (unsigned char) 10)
                        ++row;

                     ++position;
                  }
               }
            } while (*position && *position != '>');

            if (*position == '>')
               ++position; // End of comment
         }
         else
         {
            // It's not a comment declaration but could be a DTD declaration.
            // Copy it (without line feeds) into 'text'. The copy is limited
            // to MAX_TAG_SIZE characters, the same limit enforced for tags
            // below, so that an overlong or unterminated declaration cannot
            // write past the buffer; the scan still goes on up to the '>'.
            for (ptext = text; *position && *position != '>'; ++position)
            {
               if (*position == (unsigned char) 10)
                  ++row;
               else if ((ptext - text) < MAX_TAG_SIZE)
                  *ptext++ = *position;
            }
            *ptext = '\0';

            if (!mystrncasecmp((const char *)text, "doctype", 7))
            {
               for (ptext = text + 7; *ptext && isspace(*ptext); ++ptext); // Skip any whitespace
               DocType = (const char *) ptext; // Assign the DocType to the parser variable
            }

            if (*position)
               ++position; // Found the end. Let's skip the char
         }

         continue;

      }

      if (*position =='<')
      {
         ++position; // skip the initial '<'

         // Now ... something strange may appear. Let's think of
         // a malformed HTML document, in which the writer puts
         // a '<' symbol instead of a '&lt;' sgml entity.
         // Let's try to catch it, even if it is very difficult;

         // Do we have a valid character after the '<'?
         while (isspace(*position))
         {
            if (*position == (unsigned char) 10)
               ++row;

            ++position;
         }

         // Maybe it wasn't a valid tag
         // If we are here we may assume we have a valid character,
         // after '<', so an alpha char, or a '/' for closing tags.
         // But we can also have something like:
         // <B.%2  -- Don't ask me why, but somebody got it!!!

         // Another check to perform is if we find a not alphabetic
         // character before a space or a closing tag.

         bool not_good = false;
         for (ppos = position; !not_good && *ppos && !isspace(*ppos)
            && *ppos != '>'; ++ppos)
         {
            // cout << *ppos << endl;
            if (!isalnum(*ppos) && *ppos!='/')
               not_good = true;
         }

         // We found a not valid character before a space! Skip this tag.
         if (not_good)
            continue;

         // Start of a tag. Let's search for the closing '>'
         // But we can also have it after the previous loop
         if (*ppos && *ppos != '>')
            ppos = (unsigned char *) strchr((char *)position, '>');

         if (ppos)
         {

            // Another trick to catch a malformed tag declaration
            // that is to say a missing '&lt;', let's check if
            // the tag size is bigger than a fixed size (MAX_TAG_SIZE)

            if ((int) (ppos - position) > MAX_TAG_SIZE)
               continue;

            // Temporary bookmark for the end of the tag. It points to the
            // closing '>' or, when the document ends inside the tag name,
            // to the terminating NUL of the contents.
            unsigned char* pend = ppos;

            // Skip any white space at the end
            for (--ppos; *ppos && isspace(*ppos); --ppos);

            // Found. Let's copy it, by skipping '<' and '>'
            ptext=text;

            // copy the characters from the source to the destination
            while (position <= ppos)
            {
//               cout << (int) (ppos - position) << " _  " << (int) position
//                  << " _  " << (int) ppos << ": " << *position << endl;
               *ptext++ = *position++;
            }

            *ptext='\0';   // close the string

            // Skip the closing '>'. If the bookmark is the terminating NUL
            // (unterminated tag at the very end of the document) we must
            // stay on it: stepping over it would make the main loop read
            // past the end of the contents.
            position = pend;
            if (*position)
               ++position;

            ++TagPosition;

            // Let's parse the tag by using the member attribute 'text'
            // and then check the status of the parser
            switch(ParseTag())
            {
               case HtmlParser_NullTag:
                  if (debug > 1)
                     cout << "Warning! Empty (NULL) tag: "
                        << htmlstatement << " - " << text << endl;
                  break;

               case HtmlParser_TagNotStored:
                  if (debug > 3)
                     cout << "Tag not stored: "
                        << htmlstatement << " - " << text << endl;
                  break;

               case HtmlParser_MalformedTag:
                  if (debug > 0)
                     cout << "Warning! Malformed tag: "
                     << htmlstatement << " - " << text << endl;
                  break;

               case HtmlParser_StatementFailed:
                  if (debug > 0)
                     cout << "Error! Insert of HTML statement failed: "
                        << htmlstatement << " - " << text << endl;
                  return HtmlParser_StatementFailed;
                  break;

               case HtmlParser_AttributeFailed:
                  if (debug > 0)
                     cout << "Error! Insert of HTML attribute failed: "
                        << htmlattribute << " - " << text << endl;
                  return HtmlParser_AttributeFailed;
                  break;

               case HtmlParser_LinkFailed:
                  if (debug > 0)
                     cout << "Error! Insert of this link failed: "
                        << link << " - " << text << endl;
                  // Report the failure with its own code (this case used to
                  // return HtmlParser_AttributeFailed, copied from above)
                  return HtmlParser_LinkFailed;
                  break;

               // NOTE(review): HtmlParser_AccessibilityCheckFailed returned
               // by ParseTag() ends up in the default case and is ignored,
               // unlike the document level checks below - confirm this is
               // intended.
               case HtmlParser_OK:  // Do nothing
               default:  // Do nothing
                  break;
            }

         }
         else
         {
            while (*position)
               ++position;  // reach the end (no more tags)
         }
      }
      else
      {
         // We are in the title. Let's store it
         if (location & TAGtitle)
            Title.append(*position);
         else if (location & TAGlink)
         {
            // Text of an open link: build its description, storing every
            // run of whitespace as one single blank (never a leading one)
            if (isspace(*position))
            {
               if (LinkDescription.length() > 0 && !isspace(LinkDescription[LinkDescription.length() -1]))
                  LinkDescription.append(' ');
            }
            else
               LinkDescription.append(*position);
         }

         // If it is a newline we increment the row number
         if (*position == (unsigned char) 10)
            ++row;

         ++position;
      }

   }

   CurrentScheduler->CurrentUrl->SetTitle(encodeSGML(Title));
   CurrentScheduler->CurrentUrl->SetCharset(Charset);
   CurrentScheduler->CurrentUrl->SetDocType(DocType);
   CurrentScheduler->CurrentUrl->SetDescription(Description);
   CurrentScheduler->CurrentUrl->SetKeywords(Keywords);

   // If Accessibility Checks are not enabled we exit
   if (!CurrentScheduler->Config->Boolean("accessibility_checks"))
      return HtmlParser_OK;

   // //////////////////////////////////////////////////////
   // Begin of accessibility checks (document level)
   // //////////////////////////////////////////////////////
   // Missing TITLE (Open Accessibility Check: Code 50)
   if (!(doc_acheck & ACHECKDOCtitle))
   {
      // The accessibility check needs to be inserted
      if (!InsertAccessibilityCheck(CurrentScheduler->CurrentUrl->GetID(), 0, 0, 50))
         return HtmlParser_AccessibilityCheckFailed; // Failed
   }
   else
   {
      // We have a title
      unsigned counter =  CountSGMLStringLength ((const char*)
         CurrentScheduler->CurrentUrl->GetTitle());

      if (!counter)
      {
         // Empty title (code 51)
         // The accessibility check needs to be inserted
         if (!InsertAccessibilityCheck(CurrentScheduler->CurrentUrl->GetID(), 0, 0, 51))
            return HtmlParser_AccessibilityCheckFailed; // Failed
      }
      else if (counter >= 150)
      {
         // Title of 150 characters or more (code 52)
         // The accessibility check needs to be inserted
         if (!InsertAccessibilityCheck(CurrentScheduler->CurrentUrl->GetID(), 0, 0, 52))
            return HtmlParser_AccessibilityCheckFailed; // Failed
      }

   }

   // Document language
   if (DocLanguage.length())
   {
      // Check for a valid value
   }
   else
   {
      // No document language (code 48)
      // The accessibility check needs to be inserted
      if (!InsertAccessibilityCheck(CurrentScheduler->CurrentUrl->GetID(), 0, 0, 48))
         return HtmlParser_AccessibilityCheckFailed; // Failed
   }

   // //////////////////////////////////////////////////////
   // End of accessibility checks (document level)
   // //////////////////////////////////////////////////////
   return HtmlParser_OK;

}


// Parses the tag currently held in the member buffer 'text' (the characters
// found between '<' and '>').
//
// The buffer is modified in place: NUL characters are written over the
// separators in order to split it into tag name, attribute names and
// attribute contents.
//
// Steps:
//   1) the HtmlStatement object is filled in (statement, tag name, tag
//      position, row, position of the last open link);
//   2) CheckTag() updates 'location' and tells whether the tag must be
//      stored; a pending link description is written when the link has
//      just been closed; tags are not stored while 'ignore' is set
//      (that is, while a <SCRIPT> is open);
//   3) every attribute is parsed, passed to FindLink() when it has a
//      content, and stored (together with the tag, the first time) when
//      'store_statement' is true;
//   4) if "accessibility_checks" is enabled, the tag level checks are
//      recorded.
//
// Returns:
//   HtmlParser_NullTag                  - the tag is empty
//   HtmlParser_TagNotStored             - the tag does not have to be stored
//   HtmlParser_MalformedTag             - unexpected characters were found
//                                         where an attribute was expected
//   HtmlParser_StatementFailed          - insert of the statement (or of
//                                         the link description) failed
//   HtmlParser_AttributeFailed          - insert of an attribute failed
//   HtmlParser_LinkFailed               - FindLink() reported a failure
//   HtmlParser_AccessibilityCheckFailed - insert of a check failed
//   HtmlParser_OK                       - otherwise
HtmlParser::HtmlParser_Codes HtmlParser::ParseTag ()
{

   bool has_attributes = false;
   bool tag_stored = false;
   bool malformed_tag = false;
   tag_type = 0;
   int tag_acheck(0);
   CurrentHx = 0;
   // Reset all the not important tag info from the location
   location &= ~(TAGimg | TAGhx | TAGb | TAGi | TAGblink | TAGinput);
   // Initialize alternative text and resource reference strings
   CurrentAltText.trunc();
   CurrentResourceRef.trunc();
   AltAttrPosition = 0;

   // Temporary pointer
   // ('register' has been dropped: the specifier was removed in C++17)
   unsigned char *ptmp;

   // Statement
   unsigned char *Statement = text;

   // Skip initial spaces
   while (*Statement && isspace(*Statement))
   {
      if (*Statement == (unsigned char) 10)
         ++row;
      ++Statement;
   }

   if (!*Statement)
      return HtmlParser_NullTag;   // Empty

   // Reset htmlstatement variable
   htmlstatement.Reset();

   // Set the IDUrl for the HtmlStatement object
   htmlstatement.SetIDUrl(CurrentScheduler->CurrentSchedule.GetIDSchedule());

   // Set the whole statement
   htmlstatement.SetStatement((char *)Statement);

   // Set the tag position
   htmlstatement.SetTagPosition(TagPosition);

   // Set the row number
   htmlstatement.SetRow(row);

   // Set the tag position of the last link (open link - 'A' element)
   htmlstatement.SetLinkTagPosition(LastLinkTagPosition);

   // Check if we have an empty tag (the statement cannot be empty here,
   // so the index of its last character is valid)
   if (Statement[strlen((const char*) Statement) - 1] == '/')
       tag_type |= TAGempty;

   ptmp=Statement;   // Stores the beginning of the tag

   while (*Statement && !isspace(*Statement))
      ++Statement;

   if (ptmp==Statement) // No tag !!!
      return HtmlParser_NullTag;

   if (*Statement)
   {
      if (*Statement == (unsigned char) 10)
           ++row;

      // Check for a tag with attributes
      *Statement='\0';

      if (debug>5)
         cout << "Tag found: " << ptmp << endl;

      // go on
      ++Statement;

      // Skip everything but alphabetic chars after the tag
      while (*Statement && !isalpha(*Statement))
      {
         if (*Statement == (unsigned char) 10)
            ++row;
         ++Statement;
      }

      if (*Statement)
         has_attributes = true; // The current tag has attributes

   }

   htmlstatement.SetTag((char *)ptmp);

   // Determine the type of the tag (end, start)
   if (*ptmp == '/')
   {
       tag_type |= TAGend;
       ++ptmp; // skip the slash
   }
   else
       tag_type |= TAGstart;

   // We got the TAG info we need
   int old_location = location;
   if (! CheckTag((char *)ptmp))
      memo=false;    // Not store it
   else memo=true;

   // Should we insert a link description for the previous 'A' element?
   // Yes, if the link was open before this tag and is not open any more,
   // and some text has been collected in the meantime.
   if (CurrentScheduler->Config->Boolean("store_link_info")
      && !(location & TAGlink) && (old_location & TAGlink) && LinkDescription.length() > 0)
   {
       if (!CurrentScheduler->GetDB()->InsertHtmlStatementLinkDescription(htmlstatement.GetIDUrl(),
             LastLinkTagPosition, encodeSGML(LinkDescription)))
          return HtmlParser_StatementFailed; // Failed
       LastLinkTagPosition = 0; // erase the position of the last tag with a link
   }

   if (ignore)
   {
      if (! (location & TAGscript))
      {
         // We just found a closing </SCRIPT> tag
         ignore = false;
         memo = true;
      }
      else memo = false;
   }
   else
   {
      if (location & TAGscript)    // We found a <SCRIPT> tag. We ignore the following tags
         ignore = true;
   }

   // We don't have to store it
   if (!memo)
      return HtmlParser_TagNotStored;

   if (has_attributes)
   {
      // Let's look for attributes
      // Starting point: Statement now points to the first attribute

      unsigned int AttrPosition = 0;

      while (*Statement)   // Until we reach the end look for attributes
      {
         ptmp = Statement;

      // Look for an attribute definition
      // Goes on until we reach:
      // 1) the end or until a whitespace not followed by '=' (empty attribute)
      // 2) a '=': the attribute has a content which may contain SGML entities too

         while (*Statement && !isspace(*Statement) && *Statement!='=')
            ++Statement;

         while (*Statement && isspace(*Statement))
         {
            if (*Statement == (unsigned char) 10)
               ++row;

            *Statement++='\0'; // Close the attribute string
         }

         if (ptmp == Statement) // No attribute !!!
         {
            // Hey guys, if statement is not empty, this may
            // represent a malformed tag. Let's show it!
            if (*Statement)
               malformed_tag = true;

            // Cut the statement here, so that the loop ends
            *Statement='\0';
            continue;
         }

         // Reset htmlattribute variable
         htmlattribute.Reset();

         // Set the IDUrl for the HtmlAttribute object
         htmlattribute.SetIDUrl(htmlstatement.GetIDUrl());

         // Set the tag position
         htmlattribute.SetTagPosition(TagPosition);

         // Set the attribute position
         htmlattribute.SetAttrPosition(++AttrPosition);

         bool has_content = false;
         // Store attribute is set according to the 'store_only_links' value
         store_statement = !CurrentScheduler->Config->Boolean("store_only_links");

         if (*Statement && *Statement == '=')
         {
               has_content = true;  // Attribute has a content
               *Statement++='\0';
         }

         htmlattribute.SetAttribute((char *)ptmp);

         if (has_content)
         {
            // The content can be written inside '"' or not.
            // If yes we search for next '"', else for the first space.

            while(*Statement && (isspace(*Statement) || *Statement=='='))
            {
               if (*Statement == (unsigned char) 10)
                  ++row;
               ++Statement;   // Skip spaces after '=' or multiple '='
            }

            if (*Statement)
            {

               // Not empty content
               if (*Statement == '"' || *Statement == '\'')
               {

                  char qm=*Statement;  // Store the quotation mark
                  ++Statement;         // Skip quotation mark (' or ")

                  ptmp=Statement;

                  // Look for a closing quotation mark
                  Statement = (unsigned char *) strchr ((char *)ptmp, qm);

                  if (Statement)
                  {
                     // Found.
                     // NOTE(review): line feeds inside the quoted content
                     // are not counted in 'row' on this path - confirm.
                     *Statement = '\0';
                     ++Statement;
                  }
                  else
                  {
                     // Not found the closing quotation mark
                     // Everything is content
                     Statement=ptmp;
                     while (*Statement)
                     {
                        if (*Statement == (unsigned char) 10)
                           ++row;
                        ++Statement; // reach the end
                     }
                  }

                  // Set content
                  htmlattribute.SetContent((char *)ptmp);

               }
               else
               {
                  // Content outside a quotation mark
                  ptmp=Statement;

                  // Content is considered until a whitespace or the end
                  // is reached.

                  while (*Statement && !isspace(*Statement))
                     ++Statement;

                  if (*Statement)
                  {
                     if (*Statement == (unsigned char) 10)
                        ++row;
                     *Statement='\0';
                     ++Statement;
                  }

                  htmlattribute.SetContent((char *)ptmp);

               }

            }

            // We got a HTML attribute with a content.
            // Let's find a Link

            switch(FindLink())
            {
               case HtmlParser_LinkFailed:   // insert of the link failed
                  return HtmlParser_LinkFailed;
                  break;

               case HtmlParser_NormalLink:   // it has a link
               case HtmlParser_DirectLink:   // ditto
               case HtmlParser_Anchor: // we must store it
                  store_statement = true;   // the attribute contains a link
                  break;

               case HtmlParser_NoLink: // No Link. Do nothing
               default:
                  break;
            }

            // Accessibility checks
            if (CurrentScheduler->Config->Boolean("accessibility_checks"))
            {
               if ((location & TAGimg) || (location & TAGinput))
               {
                  // Attributes of IMG and INPUT tags are always stored
                  store_statement = true;

                  if (! htmlattribute.GetAttribute().nocase_compare("alt"))
                  {
                     // ALT specified: remember its text and its position
                     tag_acheck |= ACHECKTAGalt;
                     CurrentAltText = htmlattribute.GetContent();
                     AltAttrPosition = htmlattribute.GetAttrPosition();
                  }

                  if ((location & TAGinput) &&
                     !htmlattribute.GetAttribute().nocase_compare("type")
                     && !htmlattribute.GetContent().nocase_compare("image"))
                  {
                     // INPUT image specified
                     tag_acheck |= ACHECKTAGinputimg;
                  }
               }
            }
         }

         // The attribute is stored if store attribute is set to true

         if (store_statement)
         {
            // The tag also has to be inserted
            if (!tag_stored)
            {
               // Database Insertion of the HtmlStatement object

               // Check if it fails
               if (!CurrentScheduler->GetDB()->Insert(htmlstatement))
                  return HtmlParser_StatementFailed; // Failed

               tag_stored = true;

            }

            // Database Insertion of the HtmlAttribute object
            if (!CurrentScheduler->GetDB()->Insert(htmlattribute))
               return HtmlParser_AttributeFailed; // Failed

         }

         while (*Statement && isspace(*Statement))
         {
            if (*Statement == (unsigned char) 10)
               ++row;
            ++Statement;   // goes on ...
         }
      }
   }
   else
   {
      // Tag with No attributes
      // NOTE(review): 'store_statement' is only assigned inside the
      // attribute loop above, so here it still holds the value left by the
      // last attribute parsed before this tag (or the constructor's
      // 'true') - confirm this is the intended behaviour.
      if (store_statement)
      {
         // The tag also has to be inserted
         if (!CurrentScheduler->GetDB()->Insert(htmlstatement))
            return HtmlParser_StatementFailed; // Failed
      }

   }

   if (malformed_tag)
      return HtmlParser_MalformedTag;
   else if (store_statement && // Accessibility checks
      CurrentScheduler->Config->Boolean("accessibility_checks"))
   {
      // Accessibility checks
      if (location & TAGimg)
      {
         // Missing ALT (Open Accessibility Check: Code 1)
         if (!(tag_acheck & ACHECKTAGalt))
         {
            // The accessibility check needs to be inserted
            if (!InsertAccessibilityCheck(
               CurrentScheduler->CurrentUrl->GetID(), TagPosition, 0, 1))
                  return HtmlParser_AccessibilityCheckFailed; // Failed
         }
         else
         {
            unsigned altcheck = CheckAlt();

            // OAC #2
            if (altcheck & ALTsameasfile)
            {
               // The accessibility check needs to be inserted
               if (!InsertAccessibilityCheck(
                  CurrentScheduler->CurrentUrl->GetID(), TagPosition,
                  AltAttrPosition, 2))
                     return HtmlParser_AccessibilityCheckFailed; // Failed
            }

            if (altcheck & ALTlong) // OAC #3
            {
               // The accessibility check needs to be inserted
               if (!InsertAccessibilityCheck(
                  CurrentScheduler->CurrentUrl->GetID(), TagPosition,
                  AltAttrPosition, 3))
                     return HtmlParser_AccessibilityCheckFailed; // Failed
            }

            // Empty ALT if image is used as an anchor - OAC #7
            if ((altcheck & ALTempty) && (location & TAGlink))
            {
               // The accessibility check needs to be inserted
               if (!InsertAccessibilityCheck(
                  CurrentScheduler->CurrentUrl->GetID(), TagPosition,
                  AltAttrPosition, 7))
                     return HtmlParser_AccessibilityCheckFailed; // Failed
            }
         }
      }
      else if (location & TAGhx)
      {
         if (CurrentHx > 1)
         {
            // Wrong header nesting (h2 after h1, h3 after h2, etc.)
            // OAC #37, 38, 39, 40, 41
            if (!InsertAccessibilityCheck(
               CurrentScheduler->CurrentUrl->GetID(), TagPosition, 0, (35+CurrentHx)))
                  return HtmlParser_AccessibilityCheckFailed; // Failed
         }
      }
      else if (location & TAGb)
      {
         // B element should not be used (OAC #116)
         if (!InsertAccessibilityCheck(
            CurrentScheduler->CurrentUrl->GetID(), TagPosition, 0, 116))
               return HtmlParser_AccessibilityCheckFailed; // Failed
      }
      else if (location & TAGi)
      {
         // I element should not be used (OAC #117)
         if (!InsertAccessibilityCheck(
            CurrentScheduler->CurrentUrl->GetID(), TagPosition, 0, 117))
               return HtmlParser_AccessibilityCheckFailed; // Failed
      }
      else if (location & TAGblink)
      {
         // BLINK element should not be used (OAC #27)
         if (!InsertAccessibilityCheck(
            CurrentScheduler->CurrentUrl->GetID(), TagPosition, 0, 27))
               return HtmlParser_AccessibilityCheckFailed; // Failed
      }
      else if (location & TAGinput)
      {
         // Checks for input images only
         if (tag_acheck & ACHECKTAGinputimg)
         {
            // Missing ALT for input images (OAC #58)
            if (!(tag_acheck & ACHECKTAGalt))
            {
               if (!InsertAccessibilityCheck(
                  CurrentScheduler->CurrentUrl->GetID(), TagPosition, 0, 58))
                     return HtmlParser_AccessibilityCheckFailed; // Failed
            }
            else
            {
               unsigned altcheck = CheckAlt();

               // OAC #61
               if (altcheck & ALTsameasfile)
               {
                  // The accessibility check needs to be inserted
                  if (!InsertAccessibilityCheck(
                     CurrentScheduler->CurrentUrl->GetID(), TagPosition,
                     AltAttrPosition, 61))
                        return HtmlParser_AccessibilityCheckFailed; // Failed
               }

               if (altcheck & ALTlong) // OAC #60
               {
                  // The accessibility check needs to be inserted
                  if (!InsertAccessibilityCheck(
                     CurrentScheduler->CurrentUrl->GetID(), TagPosition,
                     AltAttrPosition, 60))
                        return HtmlParser_AccessibilityCheckFailed; // Failed
               }
               // This branch used to test ALTlong a second time and could
               // never be taken; ALTempty is the only ALT flag left that is
               // not handled above for INPUT images.
               else if (altcheck & ALTempty) // OAC #59
               {
                  // The accessibility check needs to be inserted
                  if (!InsertAccessibilityCheck(
                     CurrentScheduler->CurrentUrl->GetID(), TagPosition,
                     AltAttrPosition, 59))
                        return HtmlParser_AccessibilityCheckFailed; // Failed
               }
            }
         }
      }
   }

   return HtmlParser_OK;

}



// This method determines whether an attribute of the current tag needs to
// be stored and whether it contains a link. If it does, the method takes
// care of storing the link.
// The returned code tells the calling function what happened inside.

HtmlParser::HtmlParser_Codes HtmlParser::FindLink ()
{

   String Tag = htmlstatement.GetTag();
   String Attribute = htmlattribute.GetAttribute();
   int is_a_link = 0; // Values: 0 - No Link ; 1 - Normal Link ; 2 - Direct Link
                            //   -1 : Anchor (no link)
   //cout << "TAG: " << Tag << " - LOCATION PRE: " << location << endl;
                            
   String Content(htmlattribute.GetContent());

   ///////
      //    'A href'
   ///////
      
   if (! Tag.nocase_compare("A") && ! Attribute.nocase_compare("href")) // A href
   {
         is_a_link = 1;
         location |= TAGlink;
		 LastLinkTagPosition = TagPosition; // set the tag position with the last link
         LinkDescription.trunc(); // first erase the description
   }

   ///////
      //    Any 'id' attribute or "A name" could be suitable for anchors settings
   ///////
   else if (! Attribute.nocase_compare("id") || // Any id attribute
      (! Tag.nocase_compare("A") && ! Attribute.nocase_compare("name"))) // A name
   {
      // It's a anchor. Let's decode it's SGML entities
      htmlattribute.SetContent(encodeSGML(htmlattribute.GetContent()));
      // And let's store it always ... even if it's not a link
      is_a_link = -1;   // Special case - not to be stored in the link table
   }

   ///////
      //    'META' tag
   ///////

   else if (! Tag.nocase_compare("META"))
   {
      if (! Attribute.nocase_compare("content")) // Here it's the info
      {
         Configuration attrs;
         
         attrs.NameValueSeparators("=");
         attrs.Add(htmlstatement.GetStatement());
         
         if (!attrs["http-equiv"].empty())
         {
            if (! mystrcasecmp(attrs["http-equiv"], "refresh"))
            {

               String tmp (htmlattribute.GetContent());
               char *q = (char *) mystrcasestr((char *)tmp, "url=");

               if (q)
               {
                  // Found a Meta 'refresh' directive
                  
                  if (debug > 4)
                     cout << " META refresh found. " << endl;
               
                  q+=3; // skipping "URL"

      	          // And any junk space between 'URL' and '=' and after
      	          while (*q && ((*q == '=') || isspace(*q)))
                  {
                     if (*q == (unsigned char) 10)
                        ++row;
                     ++q;
                  }
		  
                  char *qq = q;
                  while (*qq && (*qq != ';') && (*qq != '"') &&
                     !isspace(*qq)) ++qq;
                  
                  *qq = 0;
                  
                  is_a_link = 1;

                  Content = q;
                  
               }
            }
	    else if (! mystrcasecmp(attrs["http-equiv"], "content-type"))
	    {
               String tmp (htmlattribute.GetContent());
               char *q = (char *) mystrcasestr((char *)tmp, "charset=");

               if (q)
	       {
                  // Found a Meta 'content-type' directive
                  
                  if (debug > 4)
                     cout << " META content-type found. " << endl;
               
                  q+=7; // skipping "charset"

      	          // And any junk space between 'charset' and '=' and after
      	          while (*q && ((*q == '=') || isspace(*q)))
                  {
                     if (*q == (unsigned char) 10)
                        ++row;
                     ++q;
                  }
		  
                  char *qq = q;
                  while (*qq && !isspace(*qq))
                      ++qq;
                  
                  *qq = 0;
                  
		  Charset = q; // Set the Charset
	       }
	    }
      }
            else if (! mystrcasecmp(attrs["name"], "description"))
                Description = htmlattribute.GetContent(); // Set the description
            else if (! mystrcasecmp(attrs["name"], "keywords"))
                Keywords = htmlattribute.GetContent(); // Set the keywords
         }
   }

   ///////
      //    'HTML' tag
   ///////
   else if (! Tag.nocase_compare("HTML"))
   {
      // Set the document language
      if (! Attribute.nocase_compare("lang")) // FRAME src
			  DocLanguage = htmlattribute.GetContent();
   }

   ///////
      //    'FRAME' tag
   ///////
   else if (! Tag.nocase_compare("FRAME"))
   {
      if (! Attribute.nocase_compare("src")) // FRAME src
         is_a_link = 1;
   }

   ///////
      //    'EMBED' tag
   ///////
   else if (! Tag.nocase_compare("EMBED"))
   {
      if (! Attribute.nocase_compare("src")) // EMBED src
         is_a_link = 2; // Direct Link
   }

   ///////
      //    'OBJECT' tag
   ///////
   else if (! Tag.nocase_compare("OBJECT"))
   {
      if (! Attribute.nocase_compare("src")) // OBJECT src
         is_a_link = 2; // Direct Link
      else if (! Attribute.nocase_compare("data")) // OBJECT data
         is_a_link = 2; // Direct Link
   }

   ///////
      //    'IMG' tag
   ///////
   else if (! Tag.nocase_compare("IMG"))
   {
      location |= TAGimg; // within an image
      if (! Attribute.nocase_compare("src")) // IMG src
	  {
         CurrentResourceRef = Content;
         is_a_link = 2; // Direct Link
	  }
	  else if (! Attribute.nocase_compare("lowsrc")) // IMG lowsrc
         is_a_link = 2; // Direct Link
   }

   ///////
      //    'AREA' tag
   ///////
   else if (! Tag.nocase_compare("AREA"))
   {
      if (! Attribute.nocase_compare("href")) // AREA href
         is_a_link = 1;
   }

   ///////
      //    'LINK' tag
   ///////
   else if (! Tag.nocase_compare("LINK"))
   {
      if (! Attribute.nocase_compare("href")) // LINK href
         is_a_link = 1;
   }
   ///////
      //    'INPUT' tag
   ///////
   else if (! Tag.nocase_compare("INPUT"))
   {
		if (tag_type & TAGstart)
		{
			location |= TAGinput;
			if (! Attribute.nocase_compare("src")) // IMG src
			{
				CurrentResourceRef = Content;
				is_a_link = 2; // Direct Link
			}
		}
        else if (tag_type & TAGend)
        	location &= ~TAGinput;
   }
   ///////
      //    'BASE' tag (Ugly command!)  ;-) 
   ///////
   else if (! Tag.nocase_compare("BASE"))
   {
      if (! Attribute.nocase_compare("href")) // BASE href
      {
         // Let's define a new BASE Url, used for resolving
         // relative URIs. I don't know who can use this, but HTML 4.0
         // enables it.

         if (BaseUrl != CurrentScheduler->CurrentUrl)
            delete BaseUrl; // Base Url different from CurrentUrl. So delete it.

         BaseUrl = new _Url (encodeSGML(Content),
            *(CurrentScheduler->CurrentUrl));

         if (BaseUrl)
         {
            if (debug > 0)      
               cout << " New Base Url for relative URIs: "
                  << BaseUrl->get() << endl;
         }
         else BaseUrl = CurrentScheduler->CurrentUrl;

      }
   }

   ///////
      //    Let's store any other 'href' attribute
   ///////
   else if (! Attribute.nocase_compare("href"))
      is_a_link = 1;
   
   ///////
      //    Let's store any other 'src' attribute
   ///////
   else if (! Attribute.nocase_compare("src"))
      is_a_link = 1;

   ///////
      //    Let's store any 'background' attribute (BODY, TABLE, etc ...)
   ///////
   else if (! Attribute.nocase_compare("background"))
         is_a_link = 2; // Direct Link


   // Let's store the links

   if (is_a_link > 0)
   {

      const String EncodedContent(encodeSGML(Content)); // Encoded URL (SGML)
      bool bad_encoded = false;

      if (mystrncasecmp("javascript:", EncodedContent, 11))
      {
         String UrlEncodedContent(EncodedContent);
         static String allowed_chars((*CurrentScheduler->Config)["url_reserved_chars"]);

         encodeURL(UrlEncodedContent, allowed_chars);  // Encoded URL (URL)
   
         // Let's check whether the URL is not well encoded
         if (EncodedContent.compare(UrlEncodedContent))
         {
            if (debug > 0)
            {
               cout << " ! URL not perfectly encoded: " << Content << " rather than "
               << UrlEncodedContent << endl;
            }

            bad_encoded = true;  // Bad encoding of the URL
         }
      }

      _Url *DestUrl = new _Url (EncodedContent,
            *BaseUrl);

      if (DestUrl)
      {

         unsigned int IDUrlDest; // Valid referenced Url
         
         CurrentScheduler->AddUrl(DestUrl->get(), IDUrlDest);

         if (debug > 3)      
            cout << htmlattribute.GetContent() << " -> "
               << DestUrl->get() << endl;

         link.Reset();     // reset the previous link object
            
         // Set the source Url ID
         link.SetIDUrlSrc(CurrentScheduler->CurrentUrl->GetID());            

         // Set the dest Url ID
         link.SetIDUrlDest(IDUrlDest);
            
         // Set the tag position
         link.SetTagPosition(htmlstatement.GetTagPosition());

         // Set the attribute position
         link.SetAttrPosition(htmlattribute.GetAttrPosition());

         if (bad_encoded)
            link.SetLinkResult("BadEncoded");

         // Set the anchor field, if a '#' is present in the
         // HTML attribute's content
         int position;  // position of '#' inside a URL

         if ((position=htmlattribute.GetContent().lastIndexOf('#'))!=-1)
         {
            // There's an anchor
            link.SetAnchor(encodeSGML(
               htmlattribute.GetContent().sub(position+1)));
         }
            
         // Set the Link Type
         switch(is_a_link)
         {
            case 1:
               link.SetLinkType("Normal");
               break;
            case 2:
               link.SetLinkType("Direct");
               break;
         }

         // Let's check whether it regards a 'file://' call 
         // which is certainly broken, or an e-mail address
         
         if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
            == SchedulerEntry::Url_FileProtocol)
         {
            // Hey, there's a 'file://' call, it's an error!
            
            link.SetLinkResult("Broken");
            if (debug > 2)      
               cout << " 'file:/' link, error!" << endl;
         }
         else if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
            == SchedulerEntry::Url_Malformed)
         {
            // Hey, there's a malformed URL, it's an error!
            
            link.SetLinkResult("Broken");
            if (debug > 2)      
               cout << " link to a malformed URL, error!" << endl;
         }
         else if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
            == SchedulerEntry::Url_EMail)
         {
            // There's an e-mail address!
            link.SetLinkResult("EMail");
            if (debug > 2)      
               cout << " e-mail address!" << endl;
         }
         else if (CurrentScheduler->CurrentLinkSchedule.GetStatus()
            == SchedulerEntry::Url_Javascript)
         {
            // There's a Javascript inserted through the pseudo-protocol
	    // that is to say 'javascript:'
            link.SetLinkResult("Javascript");
            if (debug > 2)      
               cout << " link to Javascript URL "
	          << "(through the 'javascript:' pseudo-protocol)!" << endl;
         }
         
	 // Update the Domain information for the link
	 switch(CurrentScheduler->CurrentLinkSchedule.GetDomain())
	 {
	    case SchedulerEntry::Url_External:
	       link.SetLinkDomain(Link::Link_External);
	       break;
	    case SchedulerEntry::Url_Internal:
	       if (CurrentScheduler->CurrentLinkSchedule.GetIDServer()
	          == CurrentScheduler->CurrentUrl->GetIDServer())
	          link.SetLinkDomain(Link::Link_SameServer);
	       else
   	          link.SetLinkDomain(Link::Link_Internal);
	       break;
	    case SchedulerEntry::Url_Unknown:
   	          link.SetLinkDomain(Link::Link_Unknown);
	 }
	 
         // Write the link object
         if (!CurrentScheduler->GetDB()->Insert(link))
            return HtmlParser_LinkFailed;
            
      }
               
      delete DestUrl;

   }

   //cout << "TAG: " << Tag << " - LOCATION POST: " << location << endl;
   switch (is_a_link)
   {
      case 0:
      	 return HtmlParser_NoLink;
	 break;
      case 1:
      	 return HtmlParser_NormalLink;
	 break;
      case 2:
      	 return HtmlParser_DirectLink;
	 break;
      case -1:
      	 return HtmlParser_Anchor;
	 break;
   }

   // We should not get up to here, anyway this avoid warning messages
   return HtmlParser_NoLink;
   
}


// Passes 'str' to the encode() method of the codec object obtained
// through HtSGMLCodec::instance() and returns what that call produces.
const String HtmlParser::encodeSGML(const String &str)
{
   const String encoded(HtSGMLCodec::instance()->encode(str));
   return encoded;
}

// Passes 'str' to the decode() method of the codec object obtained
// through HtSGMLCodec::instance() and returns what that call produces.
const String HtmlParser::decodeSGML(const String &str)
{
   const String decoded(HtSGMLCodec::instance()->decode(str));
   return decoded;
}


// Tells whether the tag name at the beginning of 'tag' is exactly 'name'
// (case-insensitive). The name must be followed by the end of the string,
// white space, '/' or '>', so that "A" does not match "ABBR" or "ADDRESS"
// and "HEAD" does not match "HEADER".
// NOTE(review): assumes mystrncasecmp() stops at the terminating NUL the way
// strncasecmp() does, so tag[length] is only read after a full match - confirm.
static bool HtmlParserTagNameIs(const char *tag, const char *name)
{
   const int length = (int) strlen(name);

   if (mystrncasecmp(tag, name, length))
      return false;

   const unsigned char next = (unsigned char) tag[length];
   return next == '\0' || next == '/' || next == '>' || isspace(next);
}

// Updates the parser state according to the tag whose name is 'tag',
// using 'tag_type' to tell start tags from end tags:
//  - always: the HEAD, SCRIPT, TITLE and A bits of 'location'
//    (and 'doc_acheck' when a TITLE is opened inside the HEAD);
//  - only when the "accessibility_checks" configuration value is true:
//    the Hx, B, I and BLINK bits of 'location', plus the heading level
//    bookkeeping ('CurrentHx', 'PreviousHx', 'store_statement').
// The method always returns 1.
int HtmlParser::CheckTag(char *tag)
{

   // More controls in order to decide which tags to store
   if (debug > 5)
      cout << "Checking tag: " << tag << endl;

   ///////
      //    'HEAD' tag
   ///////
   if (HtmlParserTagNameIs(tag, "HEAD"))
   {
        if (tag_type & TAGstart)
            location |= TAGhead;
        else if (tag_type & TAGend)
            location &= ~TAGhead;
   }
   ///////
      //    'SCRIPT' tag
   ///////
   else if (HtmlParserTagNameIs(tag, "SCRIPT"))
   {
        if (tag_type & TAGstart)
            location |= TAGscript;
        else if (tag_type & TAGend)
            location &= ~TAGscript;
   }
   ///////
      //    'TITLE' tag
   ///////
   else if (HtmlParserTagNameIs(tag, "TITLE"))
   {
        // The TITLE is only considered inside the HEAD
        if (location & TAGhead)
        {
            if (tag_type & TAGstart)
            {
                location |= TAGtitle;
                doc_acheck |= ACHECKDOCtitle;
            }
            else if (tag_type & TAGend)
                location &= ~TAGtitle;
        }
   }
   ///////
      //    'A' tag
   ///////
   else if (HtmlParserTagNameIs(tag, "A"))
   {
        // Only the end tag is handled here: this method never sets TAGlink
        if (tag_type & TAGend)
            location &= ~TAGlink;
   }

   ////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////
   // Accessibility Checks
   ////////////////////////////////////////////////////////////
   ////////////////////////////////////////////////////////////
   if (!CurrentScheduler->Config->Boolean("accessibility_checks"))
      return 1;

   ///////
      //    'Hx' tag
   ///////
   // The second character must be a digit, otherwise <HR> would be taken
   // for a heading and TAGhx would stay set (HR has no end tag)
   if (strlen(tag) == 2 && !mystrncasecmp(tag, "H", 1)
      && isdigit((unsigned char) tag[1]))
   {
      if (tag_type & TAGstart)
      {
         location |= TAGhx;
         CurrentHx = atoi((const char *)(htmlstatement.GetTag()) +1);

         if (CurrentHx > 0 && CurrentHx < 7)
         {
            // A jump of more than one heading level: store the statement
            // NOTE(review): when no level is skipped CurrentHx is reset
            // to 1 before being copied into PreviousHx; this looks odd
            // (H1, H2, H3 in sequence ends up flagged) - confirm intent.
            if (CurrentHx - PreviousHx > 1)
               store_statement = true;
            else
               CurrentHx = 1;
            PreviousHx = CurrentHx;
         }
      }
      else if (tag_type & TAGend)
         location &= ~TAGhx;
   }
   ///////
      //    'B' tag
   ///////
   else if (strlen(tag) == 1 && !mystrncasecmp(tag, "B", 1))
   {
      if (tag_type & TAGstart)
         location |= TAGb;
      else if (tag_type & TAGend)
         location &= ~TAGb;
   }
   ///////
      //    'I' tag
   ///////
   else if (strlen(tag) == 1 && !mystrncasecmp(tag, "I", 1))
   {
      if (tag_type & TAGstart)
         location |= TAGi;
      else if (tag_type & TAGend)
         location &= ~TAGi;
   }
   ///////
      //    'BLINK' tag
   ///////
   else if (strlen(tag) == 5 && !mystrncasecmp(tag, "BLINK", 5))
   {
      if (tag_type & TAGstart)
         location |= TAGblink;
      else if (tag_type & TAGend)
         location &= ~TAGblink;
   }

   return 1;

}

// Insert an accessibility check record into the database
bool HtmlParser::InsertAccessibilityCheck(unsigned int idurl, unsigned int tagposition,
         unsigned int attrposition, unsigned int code)
{
	// Accessibility Check object
	AccessibilityCheck accessibilitycheck;

	// Set the parameters
    accessibilitycheck.SetIDCheck(AccessibilityCheck::GetLastID() +1);
	accessibilitycheck.SetIDUrl(idurl);
	accessibilitycheck.SetTagPosition(tagposition);
	accessibilitycheck.SetAttrPosition(attrposition);
	accessibilitycheck.SetCode(code);
	// Updates the check ID (counter)
	AccessibilityCheck::SetLastID(accessibilitycheck.GetIDCheck());

	// The accessibility check needs to be inserted
	return CurrentScheduler->GetDB()->Insert(accessibilitycheck);
}

// Returns the length of the NUL-terminated string 'str' with white space
// collapsed: leading white space is not counted and every run of
// consecutive white space characters counts as a single character
// (a trailing run therefore still counts as one).
// A NULL 'str' yields 0.
unsigned HtmlParser::CountSGMLStringLength(const char* str)
{
   if (!str)
      return 0;

   unsigned counter(0);
   bool previous_was_space = false;

   for (const char* p = str; *p; ++p)
   {
      // The cast keeps isspace() well defined for characters >= 0x80
      // on platforms where plain char is signed
      const bool is_space = isspace((unsigned char) *p) != 0;

      // Count every non-space character, and a space only when it is
      // neither at the beginning nor preceded by another space
      if (!is_space || (counter && !previous_was_space))
         ++counter;

      previous_was_space = is_space;
   }

   return counter;
}

// Checks the ALT text held in 'CurrentAltText' against the resource
// reference held in 'CurrentResourceRef' and returns a bit mask:
//    ALTsameasfile - the ALT text (without surrounding white space) is,
//                    case-insensitively, the file name part of the resource
//    ALTlong       - the ALT text counts 150 characters or more
//    ALTempty      - the ALT text counts no characters
// Lengths are computed by CountSGMLStringLength() on the result of
// encodeSGML().
// Note: trailing white space is cut in place inside the buffers of both
// strings (a NUL is written over it).
unsigned HtmlParser::CheckAlt()
{
   unsigned rv(0);

   // Used instead of a NULL buffer; it is never written to, because the
   // trimming loops below only write at positions after the beginning
   static char empty_buffer[] = "";

   // Prepare the pointers to the resource string. With an empty string
   // the pointer stays at the beginning rather than one character before it
   char* begin_r = (char*) CurrentResourceRef;
   if (!begin_r)
      begin_r = empty_buffer;
   char* r = begin_r;
   if (CurrentResourceRef.length() > 0)
      r += CurrentResourceRef.length() - 1;

   // Prepare the pointers to the ALT string (same precaution)
   char* begin_a = (char*) CurrentAltText;
   if (!begin_a)
      begin_a = empty_buffer;
   char* a = begin_a;
   if (CurrentAltText.length() > 0)
      a += CurrentAltText.length() - 1;

   // Skip last spaces (Resource reference)
   for (; r > begin_r && *r && isspace((unsigned char) *r); --r)
      *r = '\0';

   // Get the file system part: go back to the last '/' (or white space)
   for (; r > begin_r && *r && *r != '/' && !isspace((unsigned char) *r); --r);

   // Move past the separator we stopped on; this also covers a '/' that is
   // the very first character (i.e. "/logo.gif")
   if (r > begin_r || *r == '/')
      ++r;

   // Skip last spaces (ALT)
   for (; a > begin_a && *a && isspace((unsigned char) *a); --a)
      *a = '\0';

   // Skip initial spaces (ALT)
   for (a = begin_a; *a && isspace((unsigned char) *a); ++a);

   // ALT text same as file name. An empty file name is never a match,
   // so an empty ALT is only reported as ALTempty
   const size_t name_length = strlen(r);
   if (name_length > 0 && name_length == strlen(a)
      && !mystrncasecmp(r, a, name_length))
      rv |= ALTsameasfile;

   // Encode the ALT and count its length
   const String EncodedAlt(encodeSGML(a)); // Encoded ALT (SGML)
   unsigned counter = CountSGMLStringLength((const char*)
      EncodedAlt);

   // ALT of 150 characters or more
   if (counter >= 150)
      rv |= ALTlong;
   else if (!counter)
      rv |= ALTempty;

   return rv;
}

