package gr.uoa.di.resourcediscovery.methods;

import gr.uoa.di.resourcediscovery.MalformedConfigurationException;
import gr.uoa.di.resourcediscovery.MethodProvider;
import gr.uoa.di.resourcediscovery.Toolkit;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.archive.modules.net.RobotsDirectives;
import org.archive.modules.net.Robotstxt;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

/**
 * Resource discovery method that locates resource links on a page by their
 * position in the DOM.
 * <p>
 * The first time a page is processed, every anchor whose target has one of the
 * configured MIME types is located and the path from that anchor up to the
 * document root is remembered as an "xpath" string of the form
 * {@code /HTML[1]/BODY[1]/A[3]}. On later pages the remembered paths are simply
 * re-applied. When the remembered paths yield nothing and {@code fallback} is
 * enabled, the page is crawled again and any new paths are learned.
 * <p>
 * Unless robots.txt handling is disabled, every page is checked against the
 * robots.txt directives for {@code agentName} before it is fetched for xpath
 * processing.
 * <p>
 * Field names are kept stable because instances are restored through
 * {@link #readResolve()}.
 */
public class XPathAndCrawl implements ResourceDiscoveryMethod {

	transient Logger logger = Logger.getLogger(XPathAndCrawl.class);

	private boolean resolveFrames = true;
	private boolean skipFirstPage = false;
	private long sleepMillis = 100;
	private boolean ignoreRobotsTxt = false;
	private String agentName = "OpenAIRE_Harvester";
	private List<String> mimeTypes = new ArrayList<String>();
	private boolean fallback = true;
	private String robotstxtUrl = null;

	transient private Robotstxt robot = null;
	transient private RobotsDirectives directives = null;

	private List<String> xpaths = new ArrayList<String>();

	/**
	 * Creates an instance with no MIME types and with robots.txt checking
	 * disabled.
	 */
	public XPathAndCrawl() {
		this.ignoreRobotsTxt = true;
	}

	// you need one per repository!
	/**
	 * Creates an instance for one repository.
	 *
	 * @param mimeTypes    MIME types that identify a resource; copied into this
	 *                     instance ({@code null} is treated as empty)
	 * @param robotstxtUrl location of the repository's robots.txt, or
	 *                     {@code null} to disable robots.txt checking
	 * @throws FileNotFoundException declared for backward compatibility; a
	 *                               missing robots.txt is handled by disabling
	 *                               robots.txt checking
	 * @throws IOException           if the URL is malformed or robots.txt cannot
	 *                               be read
	 */
	public XPathAndCrawl(List<String> mimeTypes, String robotstxtUrl) throws FileNotFoundException, IOException {
		if (mimeTypes != null) {
			this.mimeTypes.addAll(mimeTypes);
		}
		// Remember the location so that readResolve() can reload the directives
		// after deserialization; robot and directives are transient.
		this.robotstxtUrl = robotstxtUrl;
		loadRobotsTxt();
	}

	/**
	 * Sets the robots.txt location and loads its directives for the current
	 * agent name. A {@code null} URL or a missing file disables robots.txt
	 * checking; a successful load does not re-enable it.
	 *
	 * @param robotstxtUrl location of robots.txt, may be {@code null}
	 * @throws FileNotFoundException declared for backward compatibility; a
	 *                               missing robots.txt is handled by disabling
	 *                               robots.txt checking
	 * @throws IOException           if the URL is malformed or robots.txt cannot
	 *                               be read
	 */
	public void setRobotstxt(String robotstxtUrl) throws FileNotFoundException, IOException {
		this.robotstxtUrl = robotstxtUrl;
		loadRobotsTxt();
	}

	public String getRobotstxtUrl() {
		return robotstxtUrl;
	}

	/**
	 * Fetches and parses the robots.txt at {@code robotstxtUrl} and caches the
	 * directives for {@code agentName}. Disables robots.txt checking when no URL
	 * is configured or the file does not exist. The stream is always closed.
	 */
	private void loadRobotsTxt() throws IOException {
		if (robotstxtUrl == null) {
			ignoreRobotsTxt = true;
			return;
		}
		URL url = new URL(robotstxtUrl);
		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(url.openStream()));
			this.robot = new Robotstxt(in);
			this.directives = this.robot.getDirectivesFor(agentName);
		} catch (FileNotFoundException ex) {
			logger.debug("Robots.txt was not found at " + robotstxtUrl);
			ignoreRobotsTxt = true;
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails; the
					// directives (if any) have already been extracted.
				}
			}
		}
	}

	/**
	 * Returns the URLs of the resources found on the given page.
	 * <p>
	 * Steps, in order: resolve redirections; return the page itself if its MIME
	 * type is one of the configured ones; optionally add the targets of
	 * FRAME/IFRAME elements; optionally replace the first page by the HTML pages
	 * it links to; apply the known xpaths on each remaining page (learning them
	 * first if none are known); and, if nothing was found and fallback is
	 * enabled, re-learn xpaths on each page and apply them again.
	 *
	 * @param upageUrl page to inspect
	 * @param provider notified through {@code setMethod} whenever xpaths are
	 *                 learned
	 * @return absolute resource URLs, possibly empty
	 * @throws SAXException if a page cannot be parsed
	 * @throws IOException  if a page cannot be fetched
	 */
	@Override
	public List<String> getResources(URL upageUrl, MethodProvider provider) throws SAXException, IOException {

		String pageUrl = upageUrl.toString();

		logger.debug("Known xpaths: "+this.xpaths);

		pageUrl = Toolkit.getRedirectedUrl(pageUrl, this.sleepMillis);

		logger.debug("Resolved possible redirections. Url: "+pageUrl);

		List<String> ret = new ArrayList<String>();
		List<String> urls = new ArrayList<String>();
		urls.add(pageUrl);

		// The page itself is the resource: nothing to crawl.
		if (this.mimeTypes.contains(Toolkit.getMimeType(pageUrl, this.sleepMillis))) {
			ret.add(Toolkit.makeAbsolute(pageUrl, new URL(pageUrl)));
			return ret;
		}

		if (this.resolveFrames) {
			urls.addAll(resolveFrames(parse(pageUrl), new URL(pageUrl)));
			logger.debug("urls after resolving frames: " + urls);
		}

		if (this.skipFirstPage) {
			List<String> addme = new ArrayList<String>();
			for (String url : urls) {
				addme.addAll(oneDepthDown(parse(url), new URL(url)));
			}

			urls.remove(pageUrl);

			if (this.resolveFrames) {
				for (String url : urls) {
					addme.addAll(resolveFrames(parse(url), new URL(url)));
				}
			}

			urls.addAll(addme);
			logger.debug("urls after skipping 1st page and resolving frames: " + urls);
		}

		for (String url : urls) {
			logger.debug("looking for resource in: " + url);
			URL startingUrl = toCrawlableUrl(url, pageUrl);
			if (startingUrl == null) {
				continue;
			}

			// Only the first page processed without xpath information is
			// crawled; afterwards the learned xpaths are re-used.
			boolean learning = this.xpaths.isEmpty();
			if (learning) {
				logger.debug("No xpath information, crawling");
			}
			Document doc = parse(startingUrl.toString());
			if (learning) {
				learnXpaths(doc, startingUrl, pageUrl, provider);
			}
			collectResources(doc, startingUrl, ret);
		}

		if (ret.size() == 0 && this.fallback) {
			// if no xpath contained the resource, try to find it and add
			// all the xpaths
			for (String url : urls) {
				logger.debug("looking for resource in (not found in xpath): " + url);
				URL startingUrl = toCrawlableUrl(url, pageUrl);
				if (startingUrl == null) {
					continue;
				}

				Document doc = parse(startingUrl.toString());
				learnXpaths(doc, startingUrl, pageUrl, provider);
				collectResources(doc, startingUrl, ret);
			}
		}

		return ret;
	}

	/** Fetches and parses the HTML document at the given URL. */
	private Document parse(String url) throws SAXException, IOException {
		DOMParser parser = new DOMParser();
		parser.parse(url);
		return parser.getDocument();
	}

	/**
	 * Makes {@code url} absolute against {@code pageUrl} and checks it against
	 * the robots.txt directives.
	 *
	 * @return the URL to crawl, or {@code null} if the URL could not be made
	 *         absolute or is disallowed by robots.txt
	 */
	private URL toCrawlableUrl(String url, String pageUrl) throws IOException {
		String absolute;
		try {
			absolute = Toolkit.makeAbsolute(url, new URL(pageUrl));
		} catch (Exception e) {
			logger.warn("Could not make " + url + " absolute against " + pageUrl + ", skipping it", e);
			return null;
		}
		URL startingUrl = new URL(absolute);

		// directives may be null if checking was re-enabled through
		// setIgnoreRobotsTxt(false) without any robots.txt having been loaded.
		if (!this.ignoreRobotsTxt && this.directives != null
				&& !this.directives.allows(Toolkit.makeRelative(startingUrl))) {
			logger.debug("Skipping " + startingUrl + ". Disallowed by robots.txt directives.");
			return null;
		}
		return startingUrl;
	}

	/**
	 * Records the xpath of every resource anchor in {@code doc} (skipping paths
	 * that are already known) and hands this method to the provider, keyed by
	 * the protocol and host of {@code pageUrl}.
	 */
	private void learnXpaths(Document doc, URL startingUrl, String pageUrl, MethodProvider provider)
			throws SAXException, IOException {
		for (Node resourceNode : findNodesWithResource(doc, startingUrl)) {
			String xp = getXpathToRoot(resourceNode);
			// A repeated path would only produce repeated results and make the
			// stored list grow on every fallback run.
			if (!xpaths.contains(xp)) {
				xpaths.add(xp);
			}
			logger.debug(xp);
		}

		try {
			URL methodUrl = new URL(pageUrl);
			provider.setMethod(new URL(methodUrl.getProtocol()+"://"+methodUrl.getHost()), this);
		} catch(MalformedConfigurationException e) {
			logger.error("Error updating xpath information", e);
		}
	}

	/** Applies every known xpath to {@code doc} and appends the URLs found to {@code ret}. */
	private void collectResources(Document doc, URL startingUrl, List<String> ret) throws MalformedURLException {
		for (String xp : xpaths) {
			String resourceUrl = getResourceUrl(xp, doc, startingUrl);
			if (resourceUrl != null) {
				logger.debug(resourceUrl);
				ret.add(resourceUrl);
			}
		}
	}

	/**
	 * Creates an iterator over all elements of {@code doc}.
	 *
	 * @return the iterator, or {@code null} if one could not be created
	 */
	private NodeIterator elementIterator(Document doc) {
		try {
			DocumentTraversal traversal = (DocumentTraversal) doc;
			return traversal.createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, null, true);
		} catch (Exception e) {
			logger.warn("Could not create a node iterator over the document", e);
			return null;
		}
	}

	/**
	 * Returns the value of the named attribute, or {@code null} when the node
	 * has no attributes or no attribute of that name.
	 */
	private static String attributeValue(Node node, String name) {
		if (node.getAttributes() == null) {
			return null;
		}
		Node attribute = node.getAttributes().getNamedItem(name);
		return attribute == null ? null : attribute.getNodeValue();
	}

	/**
	 * Returns the absolute {@code src} URLs of all FRAME and IFRAME elements in
	 * {@code doc}. Frames without a {@code src}, or whose {@code src} cannot be
	 * made absolute, are skipped.
	 */
	private List<String> resolveFrames(Document doc, URL connectionUrl) {
		List<String> ret = new ArrayList<String>();

		NodeIterator iterator = elementIterator(doc);
		if (iterator == null) {
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			String nodeName = n.getNodeName();
			if (!nodeName.equals("FRAME") && !nodeName.equals("IFRAME")) {
				continue;
			}
			String url = attributeValue(n, "src");
			if (url == null) {
				continue;
			}
			try {
				ret.add(Toolkit.makeAbsolute(url, connectionUrl));
			} catch (MalformedURLException ex) {
				logger.debug("Ignoring frame with malformed src: " + url);
			}
		}
		return ret;
	}

	/**
	 * Returns the absolute targets of all anchors in {@code doc} whose MIME type
	 * contains {@code text/html}. Anchors without an {@code href}, or whose
	 * {@code href} cannot be made absolute, are skipped.
	 */
	private List<String> oneDepthDown(Document doc, URL connectionUrl) throws IOException {
		List<String> ret = new ArrayList<String>();

		NodeIterator iterator = elementIterator(doc);
		if (iterator == null) {
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (!n.getNodeName().equals("A")) {
				continue;
			}
			String url = attributeValue(n, "href");
			if (url == null) {
				continue;
			}
			try {
				url = Toolkit.makeAbsolute(url, connectionUrl);
				String mimeType = Toolkit.getMimeType(url, this.sleepMillis);
				// NOTE(review): guards against a null MIME type; confirm whether
				// Toolkit.getMimeType can actually return null.
				if (mimeType != null && mimeType.trim().contains("text/html")) {
					ret.add(url);
				}
			} catch (MalformedURLException ex) {
				continue;
			}
		}
		return ret;
	}

	/**
	 * Builds the path from {@code node} up to the HTML element (or to the top of
	 * the tree if no HTML ancestor exists), as {@code /NAME[index]} segments.
	 * <p>
	 * For ordinary nodes the index is one plus the number of preceding siblings
	 * with the same node name. For the HTML node the index is one plus the
	 * number of ALL preceding siblings, whatever their name.
	 * NOTE(review): this differs from the per-name counting used when the path
	 * is resolved in getResourceUrl; left unchanged because already stored
	 * xpaths depend on this format - confirm whether it is intentional.
	 */
	private String getXpathToRoot(Node node) {
		String xpath = "";
		do {
			if (node.getNodeName().equals("HTML")) {
				int before = 1;
				while ((node = node.getPreviousSibling()) != null)
					before++;
				return "/HTML["+before+"]" + xpath;
			}
			int before = 0;
			Node current = node;
			while ((current = current.getPreviousSibling()) != null)
				if (current.getNodeName().equals(node.getNodeName()))
					before++;
			xpath = "/" + node.getNodeName() + "[" + (before + 1) + "]" + xpath;
		} while ((node = node.getParentNode()) != null);
		return xpath;
	}

	/**
	 * Returns every anchor in {@code doc} whose target has one of the configured
	 * MIME types. Anchors without an {@code href}, or whose {@code href} cannot
	 * be made absolute, are skipped.
	 */
	private List<Node> findNodesWithResource(Document doc, URL connectionUrl) throws IOException {
		List<Node> ret = new ArrayList<Node>();

		NodeIterator iterator = elementIterator(doc);
		if (iterator == null) {
			return ret;
		}

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if (!n.getNodeName().equals("A")) {
				continue;
			}
			String url = attributeValue(n, "href");
			if (url == null) {
				// anchor without href
				continue;
			}
			try {
				url = Toolkit.makeAbsolute(url, connectionUrl);
				String mimeType = Toolkit.getMimeType(url, this.sleepMillis);
				// NOTE(review): guards against a null MIME type; confirm whether
				// Toolkit.getMimeType can actually return null.
				if (mimeType != null && this.mimeTypes.contains(mimeType.trim())) {
					ret.add(n);
				}
			} catch (MalformedURLException ex) {
				continue;
			}
		}
		return ret;
	}

	/**
	 * Follows {@code xpath} (as produced by {@link #getXpathToRoot(Node)}) from
	 * the top of {@code doc} and returns the absolute {@code href} of the node
	 * it ends at.
	 *
	 * @return the absolute URL, or {@code null} if the path does not exist in
	 *         this document, the final node has no {@code href}, the path is
	 *         malformed, or the URL cannot be made absolute
	 */
	private String getResourceUrl(String xpath, Document doc, URL url) throws MalformedURLException {
		try {
			// First node of the sibling list currently being searched.
			Node level = doc.getFirstChild();
			// Node selected by the most recent path segment. Tracked directly
			// so that a matching node without children is still usable.
			Node matched = null;

			for (String element : xpath.split("/")) {
				if (element.trim().equals("")) {
					continue;
				}
				int bracket = element.indexOf('[');
				int position = Integer.parseInt(element.substring(bracket).replaceAll("\\[", "").replaceAll("\\]", ""));
				String name = element.substring(0, bracket);

				matched = null;
				int found = 0;
				for (Node candidate = level; candidate != null; candidate = candidate.getNextSibling()) {
					if (candidate.getNodeName().equals(name) && ++found == position) {
						matched = candidate;
						break;
					}
				}
				if (matched == null) {
					return null;
				}
				level = matched.getFirstChild();
			}

			if (matched == null) {
				return null;
			}
			String href = attributeValue(matched, "href");
			if (href == null) {
				return null;
			}
			return Toolkit.makeAbsolute(href, url);
		} catch (Exception e) {
			// Malformed path segment or href: treated as "no resource here".
			logger.debug("Could not resolve xpath " + xpath + " in " + url, e);
			return null;
		}
	}

	/**
	 * Restores the transient state (logger, robots.txt directives) after
	 * deserialization. A missing robots.txt disables robots.txt checking, as it
	 * does in the constructor and in {@link #setRobotstxt(String)}.
	 */
	private Object readResolve() throws IOException {
		// The logger is transient and must be restored before anything logs.
		logger = Logger.getLogger(XPathAndCrawl.class);
		loadRobotsTxt();
		return this;
	}

	public boolean isResolveFrames() {
		return resolveFrames;
	}

	public void setResolveFrames(boolean resolveFrames) {
		this.resolveFrames = resolveFrames;
	}

	public boolean isSkipFirstPage() {
		return skipFirstPage;
	}

	public void setSkipFirstPage(boolean skipFirstPage) {
		this.skipFirstPage = skipFirstPage;
	}

	public long getSleepMillis() {
		return sleepMillis;
	}

	public void setSleepMillis(long sleepMillis) {
		this.sleepMillis = sleepMillis;
	}

	public List<String> getMimeTypes() {
		return mimeTypes;
	}

	public void setMimeTypes(List<String> mimeTypes) {
		this.mimeTypes = mimeTypes;
	}

	public List<String> getXpaths() {
		return xpaths;
	}

	public void setXpaths(List<String> xpaths) {
		this.xpaths = xpaths;
	}

	public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
		this.ignoreRobotsTxt = ignoreRobotsTxt;
	}

	public boolean isIgnoreRobotsTxt() {
		return ignoreRobotsTxt;
	}

	/**
	 * Sets the agent name and, if a robots.txt has been loaded, refreshes the
	 * cached directives for the new name.
	 */
	public void setAgentName(String agentName) {
		this.agentName = agentName;
		// No robots.txt is loaded after the default constructor or when the
		// file was not found.
		if (this.robot != null) {
			this.directives = this.robot.getDirectivesFor(agentName);
		}
	}

	public String getAgentName() {
		return agentName;
	}

	public void setFallback(boolean fallback) {
		this.fallback = fallback;
	}

	public boolean isFallback() {
		return fallback;
	}

}
