/***************************************************************************
 *                       StreamHTML.cpp  -  description
 *                               -------------------
 *  begin                : Tue March 1 10:40:21 BST 2003
 *  copyright            : (C) 2002 by Dmitri Skachkov
 *  email                : d_skachkov@yahoo.com
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/


#include "StreamHTML.h"

// Destructor. Performs no explicit cleanup in this translation unit.
// NOTE(review): textstream and links are allocated with new in openFile() /
// findLinks(); confirm that they are released elsewhere (e.g. a base class).
StreamHTML::~StreamHTML() {}

// Classifies one character read from the HTML stream.
//
// t: the character just read from the stream.
//
// Returns:
//  - for '<' or '&': the result of getTag(t), or a null QChar while inside
//    <head> or <script>;
//  - for '\n': a null QChar (with newLine / newParagraph set) or a single
//    space, depending on reparagraph, tag.pre and the text that follows;
//  - for any other character: t itself, or a null QChar while inside
//    <head> or <script>.
QChar StreamHTML::isTag(QChar t)
{
    QChar gt;
    if ((t != '<') && (t !='&'))
    {
	// Plain text is suppressed while inside <head> or <script>.
	if (tag.head) return QChar();
	if (tag.script) return QChar();
    	if (t == '\n')
    	{
	    /*if (reparagraph)
	    {
		//newLine = false;
		return ' ';
	    } else if (tag.pre)*/
    	    QChar ch;
    	    QString s;
    	    //QString spaceRegExp = "^ ";
    	    int c,p;
    	    // Remember where we are so the look-ahead below can be undone.
    	    p = getPosition();
    	    // Number of additional look-ahead characters collected into s.
    	    c = 10;
    	    s = "";
    	    if (reparagraph)
    	    {
		// A line break at the very end of the stream is a plain new line.
		if (textstream->atEnd())
		{
	    	    newLine = true;
	    	    return QChar();
		}
		// Read the first significant character after the line break:
		// CR is skipped when stripCR is set, otherwise treated as LF.
		while (1)
		{
	    	    textstream->operator>>(ch);
	    	    if (recode) ch = decoder->getQChar(ch);
	    	    // Note: the position is not rewound on this path.
	    	    if (textstream->atEnd())
	    	    {
			newLine = true;
			return QChar();
	    	    }
	    	    if (stripCR && (ch == '\r')) continue;
	    	    if (ch == '\r') ch = '\n';
	    	    break;
		}
		// Two consecutive line breaks: start a new paragraph. The second
		// line break stays consumed (no rewind on this path).
		if (ch == '\n')
		{
	    	    newLine = true;
	    	    newParagraph = true;
	    	    return QChar();
		}
		s.append(ch);
		// Collect up to c more characters of look-ahead into s.
		// NOTE(review): unlike the read above, these characters are not
		// passed through decoder->getQChar() even when recode is set.
        	while (c)
        	{
	    	    if (textstream->atEnd()) break;
    	    	    textstream->operator>>(ch);
	    	    if (stripCR && (ch == '\r')) continue;
	    	    if (ch == '\r') ch = '\n';
	    	    s.append(ch);
	    	    --c;
        	}
		//printf("-->%s<--\n",s.latin1());
		// Next line begins (after optional whitespace) with a dash or a
		// quote character: keep the line break, rewind the look-ahead.
		if (s.find(QRegExp("^\\s*[\\-\"`']"),0)>-1)
		{
	    	    newLine = true;
	    	    setPosition(p);
	    	    //printf("1\n");
	    	    return QChar();
		}
		/*if (removeSpacePads && spacePadded)
		{
	    	    for (unsigned int i=0;i<spacePadded;i++)
	    	    {
			spaceRegExp += " ";		
	    	    }
	    	    if (s.find(QRegExp(spaceRegExp),0)>-1)
	    	    {
			newParagraph = true;
			newLine = true;
			setPosition(p);
			return QChar();
	    	    }
		} else if (s.find(QRegExp("^\\s+"),0)>-1)*/
		// Next line begins with whitespace (indented): treat it as the
		// start of a new paragraph and rewind the look-ahead.
		if (s.find(QRegExp("^\\s+"),0)>-1)
		{
	    	    newParagraph = true;
	    	    newLine = true;
	    	    setPosition(p);
	    	    //printf("2\n");
		    return QChar();
		}
    	    } else if (tag.pre)
	    {
	    	// Inside <pre> (and not re-paragraphing) line breaks are kept.
	    	newLine = true;
	    	return QChar();
	    }
	// Otherwise the line break is folded into a single space; undo any
	// look-ahead first.
	setPosition(p);
	    return ' ';
    	}
	return t;
    }
    // '<' or '&': let getTag() parse the tag or entity. It updates the tag
    // state, so the head/script checks are made after the call.
    gt = getTag(t);
    if (tag.head) return QChar();
    if (tag.script) return QChar();
    return gt;
}

QChar StreamHTML::getTag(QChar t)
{
    if (t == '&')
    {
	bool ok;
	tempPos = getPosition();
	QString num = "";
	QString entity = "";
	int i = 16;
	while (--i)
	{
	    textstream->operator>>(tc);
	    if (tc == ';') break;
	    entity.append(tc.lower());
	}
	if (!i)
	{
	    setPosition(tempPos);
	    return t;
	}
	/*if (entity == "lt") return QChar('<');
	if (entity == "gt") return QChar('>');
	if (entity == "quot") return QChar('"');
	if (entity == "nbsp") return QChar(' ');
	if (entity == "amp") return QChar('&');*/
	if (entity.at(0) == '#')
	{
	    c = QChar(entity.mid(1).toUShort(&ok,10));
	    if (ok) return c;
	} else {
	    c = entityToChar(entity);
	    return c;
	}
	setPosition(tempPos);
	return t;
    } else {
	//newParagraph = false;
	tagString = "";
	while (1)
	{
	    if (atEnd()) break;
	    //tempPos = getPosition();
	    textstream->operator>>(tc);
	    if (tc == '>') break;
	    tagString.append(tc);
	}
	if (tagString.at(tagString.length()-1) == '/')
	{
	    tagString.truncate(tagString.length()-1);
	}
	//setPosition(tempPos);
	tagString = tagString.stripWhiteSpace();
	tagStringLc = tagString.lower();
	tagStart = true;
	if (tagStringLc.at(0) == '/')
	{
	    tagStart = false;
	    tagStringLc.remove(0,1);
	    tagString.remove(0,1);
	    tagName = tagStringLc;
	} else {
	    tagName = tagStringLc;
	    tagName.replace(QRegExp(" .*$"),"");
	}
	if (tagName == "p")
	{
	    //printf("<p>");
	    tag.p = tagStart;
	    newParagraph = tagStart;
	    newLine = true;
	} else if (tagName == "br")
	{
	   tag.br = tagStart;
	   newLine = tagStart;
	} else if (tagName == "tr")
	{
	    tag.br = tagStart;
	    newLine = tagStart;
	} else if (tagName == "hr")
	{
	    newParagraph = true;
	    newLine = true;
	} else if (tagName == "ul")
	{
	    tag.ul = tagStart;
	} else if (tagName == "li")
	{
	    //newLine = true;
	    tag.li = tagStart;
	} else if (tagName == "ol")
	{
	    tag.ol = tagStart;
	} else if (tagName == "dl")
	{
	    tag.dl = tagStart;
	} else if (tagName == "pre")
	{
	    tag.pre = tagStart;
	    newLine = tagStart;
	} else if (tagName == "strong")
	{
	    tag.strong = tagStart;
	} else if (tagName == "i")
	{
	    tag.i = tagStart;
	} else if (tagName == "b")
	{
	    tag.b = tagStart;
	} else if (tagName == "tt")
	{
	    tag.tt = tagStart;
	} else if (tagName == "big")
	{
	    tag.big = tagStart;
	} else if (tagName == "small")
	{
	    tag.small = tagStart;
	} else if (tagName == "em")
	{
	    tag.em = tagStart;
	} else if (tagName == "dfn")
	{
	    tag.dfn = tagStart;
	} else if (tagName == "samp")
	{
	    tag.samp = tagStart;
	} else if (tagName == "kbd")
	{
	    tag.kbd = tagStart;
	} else if (tagName == "var")
	{
	    tag.var = tagStart;
	} else if (tagName == "cite")
	{
	    tag.cite = tagStart;
	} else if (tagName == "del")
	{
	    tag.del = tagStart;
	} else if ((tagName == "s") || (tagName == "strike"))
	{
	    tag.strike = tagStart;
	} else if (tagName == "u")
	{
	    tag.u = tagStart;
	} else if (tagName == "a")
	{
	    tag.a = tagStart;
	    tag.href="";
	    if (tagStringLc.find(QRegExp("href="),0)>-1)
	    {
		unsigned int i;
		i = tagStringLc.find(QRegExp("href="),0) + 5;
		while((i<tagString.length()) && (!tagString.at(i).isSpace()))
		{
		    tag.href.append(tagString.at(i++));
		}
		(tag.href.stripWhiteSpace());
		if (tag.href.at(0) == '\"') tag.href = tag.href.mid(1);
		if (tag.href.at(tag.href.length()-1) == '\"') tag.href= tag.href.left(tag.href.length()-1);
		if (tag.href.at(0) == '\'') tag.href = tag.href.mid(1);
		if (tag.href.at(tag.href.length()-1) == '\'') tag.href= tag.href.left(tag.href.length()-1);
	    }
	} else if (tagName == "link")
	{
	    tag.link = tagStart;
	} else if (tagName == "tag")
	{
	    tag.html = tagStart;
	} else if (tagName == "head")
	{
	    tag.head = tagStart;
	} else if (tagName == "title")
	{
	    tag.title = tagStart;
	} else if (tagName == "body")
	{
	    tag.body = tagStart;
	} else if (tagName == "script")
	{
	    tag.script = tagStart;
	} else if (tagName == "div")
	{
	    tag.div = tagStart;
	    newLine = tagStart;
	} else if (tagName.find(QRegExp("^h[1-6]$")) > -1)
	{
	    //newParagraph = tagStart;
	    newLine = true;
	    if (tagStart)
	    {
		newParagraph = true;
		tag.h = int((7 - tagName.at(1).digitValue())/2+1);
	    } else {
		//newLine = true;
		tag.h = 0;
	    }
	} else if (tagName == "code")
	{
	    tag.code = tagStart;
	} else if (tagName == "img")
	{
	    tag.imgAlt = "";
	    tag.imgSrc = "";
	    int src;
	    int alt;
	    tag.img = tagStart;
	    src = tagStringLc.find(QRegExp("src="));
	    alt = tagStringLc.find(QRegExp("alt="));
	    if (alt>-1)
	    {
		alt += 4;
		while (1)
		{
		    if ((unsigned (alt)>=tagString.length()) || (tagString.at(alt).isSpace())) break;
		    tag.imgAlt.append(tagString.at(alt));
		    ++alt;
		}
		
	    }
	    if (src>-1)
	    {
		src += 4;
		while (1)
		{
		    if ((unsigned (src)>=tagString.length()) || (tagString.at(src).isSpace())) break;
		    tag.imgSrc.append(tagString.at(src));
		    ++src;
		}
	    }
	    (tag.imgSrc.stripWhiteSpace());
	    if (tag.imgSrc.at(0) == '\"') tag.imgSrc = tag.imgSrc.mid(1);
	    if (tag.imgSrc.at(tag.imgSrc.length()-1) == '\"') tag.imgSrc= tag.imgSrc.left(tag.imgSrc.length()-1);
	    if (tag.imgSrc.at(0) == '\'') tag.imgSrc = tag.imgSrc.mid(1);
	    if (tag.imgSrc.at(tag.imgSrc.length()-1) == '\'') tag.imgSrc= tag.imgSrc.left(tag.imgSrc.length()-1);
	    (tag.imgAlt.stripWhiteSpace());
	    if (tag.imgAlt.at(0) == '\"') tag.imgAlt = tag.imgAlt.mid(1);
	    if (tag.imgAlt.at(tag.imgAlt.length()-1) == '\"') tag.imgAlt= tag.imgAlt.left(tag.imgAlt.length()-1);
	    if (tag.imgAlt.at(0) == '\'') tag.imgAlt = tag.imgAlt.mid(1);
	    if (tag.imgAlt.at(tag.imgAlt.length()-1) == '\'') tag.imgAlt= tag.imgAlt.left(tag.imgAlt.length()-1);
	} else {
	}
	//printf("<");
	//printf(tagName);
	//printf(">");
	return QChar();	
    }
}

bool StreamHTML::openFile(const QString & filepath)
{
    QFile f;
    if ( !QFile::exists( filepath ) ) return false;
    f.setName( filepath ); 
    if (!f.open( IO_ReadOnly ))
    {
	return false;
    }
    fileOpened = true;
    f.close();
    if (file.isOpen()) file.close();
    file.setName( filepath ); 
    file.open( IO_ReadOnly );
    docSize = file.size();
    numberOfPages = int(docSize/2000) + 1;
    newParagraph = false;
    newLine = false;
    if (textstream) textstream->~QTextStream();
    textstream = new QTextStream(&file);
    resetTags();
    setEncoding(encoding);
    if (links) delete [] links;
    links = 0;
    linksFound = false;
    return true;
}

void StreamHTML::findLinks()
{
    if (linksFound) return;
    if (!textstream) return;
    //printf("Looking for links\n");
    int p;
    QChar ch;
    QString tag;
    linksCount = 0;
    tempLinks = new linksProps[3000];
    p = getPosition();
    setPosition(0);
    while (!atEnd())
    {
	textstream->operator>>(ch);
	if (ch != '<') continue;
	tag = "";
	while (1)
	{
	    if (atEnd()) break;
	    textstream->operator>>(ch);
	    if (ch == '>')
	    {
		if (tag.lower().find(QRegExp("^a\\s+name\\s*=\\s*\".+\""))>-1)
		{
		    tempLinks[linksCount].position = getPosition();
		    tempLinks[linksCount].name = "";
		    for (unsigned int i = (tag.find(QRegExp("\"")) + 1); i<tag.length();i++)
		    {
			if ((ch = tag.at(i)) == '"') break;
			tempLinks[linksCount].name.append(ch);
		    }
		    ++linksCount;
		}
		break;
	    } else {
		tag.append(ch);
	    }
	}
    }
    if (linksCount>0)
    {
	if (links) delete [] links;
	links = new linksProps[linksCount];
	for (int i = 0; i<linksCount;i++)
	{
	    links[i] = tempLinks[i];
	}
    }
    delete [] tempLinks;
    tempLinks = 0;
    linksFound = true;
    setPosition(p);
}

/*
 * Reads forward from the current stream position to the next named anchor
 * (<a name="...">).
 *
 * When one is found, returns a four-line record: the next eight words read
 * with readWordForward(true) (each followed by a space), the page number,
 * the stream position right after the anchor tag, and the literal "1",
 * each line terminated by '\n'.
 * Returns an empty string when the end of the stream is reached first.
 */
QString StreamHTML::findNextContent()
{
    QRegExp namedAnchor("^a\\s+name\\s*=\\s*\".+\"");
    QChar current;
    QString markup;

    while (!atEnd())
    {
        // Skip ahead to the start of the next tag.
        textstream->operator>>(current);
        if (current != '<') continue;

        // Gather the tag text up to its closing '>'.
        markup = "";
        bool closed = false;
        while (!atEnd())
        {
            textstream->operator>>(current);
            if (current == '>')
            {
                closed = true;
                break;
            }
            markup.append(current);
        }
        // Stream ended inside the tag: the outer loop terminates as well.
        if (!closed) continue;
        if (markup.lower().find(namedAnchor) < 0) continue;

        // Position and page are taken before the preview words are read.
        const int anchorPosition = getPosition();
        const int anchorPage = getPageNumber();
        QString preview = "";
        QString word;
        for (int n = 0; n < 8; n++)
        {
            word = readWordForward(true);
            preview += word + " ";
        }
        return preview + "\n" + QString::number(anchorPage)+"\n"+QString::number(anchorPosition)+"\n1\n";
    }
    return "";
}

/*
 * Looks up the named anchor l in the link table, building the table with
 * findLinks() first if that has not been done yet.
 *
 * Returns the stored position of the first anchor whose name equals l,
 * or -1 when there is no such anchor.
 */
int StreamHTML::getInLinkPosition(QString l)
{
    if (!linksFound) findLinks();

    // With an empty table the loop body never runs and -1 is returned.
    for (int idx = 0; idx < linksCount; ++idx)
    {
        if (l == links[idx].name) return links[idx].position;
    }
    return -1;
}
