package eu.dnetlib.functionality.index.utils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.oro.text.perl.Perl5Util;

import eu.dnetlib.miscutils.functional.UnaryFunction;

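/**
 * Function that removes unwanted highlight markers ({@code [hl]} and {@code [/hl]})
 * from a document.
 * <p>
 * It takes the document as input and returns the cleaned document: markers are
 * stripped from the part preceding {@code </header>}, and from inside markup tags
 * (the text between {@code <} and {@code >}) in the part following it.
 * 
 * @author claudio
 *
 */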
public class HighlightUtils implements UnaryFunction<String, String> {
	
	/** Default marker placed before a highlighted fragment. */
	public final static String DEFAULT_HL_PRE  = "[hl]";
	
	/** Default marker placed after a highlighted fragment. */
	public final static String DEFAULT_HL_POST = "[/hl]";

	/** Literal that separates the header part of the document from its body. */
	private static final String HEADER_END = "</header>";

	/** Perl5 substitution that deletes every highlight marker (used on the header). */
	private static final String CLEAN_HEADER = "s#\\[/?hl\\]##gm";

	/** A markup tag: a '<', any run of characters other than '>', then the closing '>'. */
	private static final Pattern TAG = Pattern.compile("<[^>]*>");

	/** An opening or closing highlight marker. */
	private static final Pattern HL_MARKER = Pattern.compile("\\[/?hl\\]");
	
	private final Perl5Util p5util = new Perl5Util();
	
	/**
	 * Removes the highlight markers that must not appear in the document: all of
	 * them in the header (everything before the first {@code </header>}), and those
	 * that ended up inside markup tags in the body (everything after it).
	 * <p>
	 * The document is split at the FIRST {@code </header>} only, so any further
	 * occurrence is kept as part of the body instead of being dropped. A document
	 * without {@code </header>} is treated as body only, which is the conservative
	 * choice: markers in plain text are preserved.
	 * 
	 * @param doc
	 * 			the document, may be null
	 * @return
	 * 			the cleaned document, or null if doc is null
	 */
	@Override
	public String evaluate(String doc) {
		if (doc == null) {
			return null;
		}
		int headerEnd = doc.indexOf(HEADER_END);
		if (headerEnd < 0) {
			// No header terminator: nothing can be identified as header.
			return cleanBody(doc);
		}
		String header = doc.substring(0, headerEnd);
		String body = doc.substring(headerEnd + HEADER_END.length());
		return cleanHeader(header) + HEADER_END + cleanBody(body);
	}
	
	/**
	 * Deletes every highlight marker from the header.
	 * 
	 * @param header
	 * 			the header part of the document
	 * @return
	 * 			the header without highlight markers
	 */
	private String cleanHeader(String header) {
		return p5util.substitute(CLEAN_HEADER, header);
	}
	
	/**
	 * Deletes the highlight markers found inside markup tags, leaving the markers
	 * in the text between tags untouched.
	 * <p>
	 * Done in a single pass over the body: markers contain neither '<' nor '>', so
	 * removing them never moves a tag boundary and each tag can be cleaned on its own.
	 * 
	 * @param body
	 * 			the body part of the document
	 * @return
	 * 			the body with highlight markers removed from its tags
	 */
	private String cleanBody(String body) {
		Matcher tags = TAG.matcher(body);
		// StringBuffer: the StringBuilder overloads of appendReplacement need Java 9+.
		StringBuffer cleaned = new StringBuffer(body.length());
		while (tags.find()) {
			// quoteReplacement: the tag text may contain '$' or '\', which
			// appendReplacement would otherwise interpret.
			tags.appendReplacement(cleaned, Matcher.quoteReplacement(stripMarkers(tags.group())));
		}
		tags.appendTail(cleaned);
		return cleaned.toString();
	}

	/**
	 * Deletes highlight markers from a single tag, repeating until none is left,
	 * because a removal can join the surrounding characters into a new marker
	 * (e.g. {@code [h[hl]l]}).
	 */
	private static String stripMarkers(String tag) {
		String current = tag;
		while (true) {
			String next = HL_MARKER.matcher(current).replaceAll("");
			if (next.length() == current.length()) {
				return current;
			}
			current = next;
		}
	}
	
}
