package eu.dnetlib.index.utils;

import java.util.function.UnaryOperator;
import java.util.regex.Pattern;

/**
 * Removes unwanted highlight markers ({@code [hl]} and {@code [/hl]}) from a document.
 *
 * <p>The document is split at the first occurrence of {@code </header>}:
 * <ul>
 * <li>in the part before it, every highlight marker is removed;</li>
 * <li>in the part after it, only the markers located between a {@code <} and the
 * following {@code >} are removed, markers found elsewhere are kept.</li>
 * </ul>
 * When the document contains no {@code </header>}, the whole input is handled as the
 * second part.
 *
 * @author claudio
 */
public class HighlightUtils implements UnaryOperator<String> {

	public static final String DEFAULT_HL_PRE = "[hl]";

	public static final String DEFAULT_HL_POST = "[/hl]";

	/** Literal delimiter separating the header from the rest of the document. */
	private static final String HEADER_END = "</header>";

	/** Matches an opening or a closing highlight marker. */
	private static final Pattern HL_MARKER = Pattern.compile("\\[/?hl\\]");

	/** Matches a {@code <...>} span containing an opening marker; groups 1 and 2 hold the text around it. */
	private static final Pattern CLEAN_REGEX_OPEN = Pattern.compile("<([^>]*)\\[hl\\]([^>]*)>");

	/** Matches a {@code <...>} span containing a closing marker; groups 1 and 2 hold the text around it. */
	private static final Pattern CLEAN_REGEX_CLOSE = Pattern.compile("<([^>]*)\\[/hl\\]([^>]*)>");

	/** Rebuilds the matched span without the marker. */
	private static final String WITHOUT_MARKER = "<$1$2>";

	/**
	 * Removes the highlight markers found between a {@code <} and the following {@code >}.
	 *
	 * @param body
	 *            the text to clean, must not be null
	 * @return the text with no highlight marker left inside any {@code <...>} span
	 */
	private String cleanBody(final String body) {
		String current = body;
		// A single pass removes at most one opening and one closing marker per <...> span,
		// so the passes are repeated until the text stops changing.
		while (true) {
			final String withoutOpen = CLEAN_REGEX_OPEN.matcher(current).replaceAll(WITHOUT_MARKER);
			final String next = CLEAN_REGEX_CLOSE.matcher(withoutOpen).replaceAll(WITHOUT_MARKER);
			if (next.equals(current)) {
				return next;
			}
			current = next;
		}
	}

	/**
	 * Cleans the highlight markers of the given document.
	 *
	 * @param doc
	 *            the document to clean, must not be null
	 * @return the cleaned document
	 */
	@Override
	public String apply(final String doc) {
		final int headerEnd = doc.indexOf(HEADER_END);
		if (headerEnd < 0) {
			// No header delimiter: handle the whole input as body instead of failing.
			return cleanBody(doc);
		}

		final String header = doc.substring(0, headerEnd);
		// Everything after the first delimiter is body, including any further delimiter occurrence.
		final String body = doc.substring(headerEnd + HEADER_END.length());

		return HL_MARKER.matcher(header).replaceAll("") + HEADER_END + cleanBody(body);
	}
}
