package eu.dnetlib.iis.ingest.html;

import java.io.IOException;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;

/**
 * Module ingesting plain text from an HTML document.
 *
 * @author mhorst
 */
public class HtmlToPlaintextIngester extends Mapper<AvroKey<DocumentText>, NullWritable, AvroKey<DocumentText>, NullWritable> {

    private final Logger log = Logger.getLogger(this.getClass());

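    /**
     * Extracts a plain-text representation from the HTML payload of the input record and
     * emits a {@link DocumentText} record carrying the same identifier. Records with a null
     * text field are skipped; extraction failures are logged and the record is dropped.
     */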
    @Override
    protected void map(AvroKey<DocumentText> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        DocumentText nlm = key.datum();
        if (nlm.getText() != null) {
            final DocumentText.Builder output = DocumentText.newBuilder();
            output.setId(nlm.getId());
            try {
                // skipping newlines
                // output.setText(Jsoup.parse(nlm.getText().toString()).text());
                // preserving newlines
                output.setText(cleanNoMarkup(nlm.getText().toString()));
                context.write(new AvroKey<DocumentText>(output.build()),
                        NullWritable.get());
            } catch (Exception e) {
                log.error("exception thrown when trying to extract text representation "
                        + "from html document identified with: " + nlm.getId(), e);
            }
        }
    }

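    /**
     * Strips all HTML markup by cleaning the input against an empty whitelist while
     * disabling pretty-printing so that existing line breaks are preserved in the output.
     */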
    private static String cleanNoMarkup(String input) {
        final Document.OutputSettings outputSettings = new Document.OutputSettings().prettyPrint(false);
        String output = Jsoup.clean(input, "", Whitelist.none(), outputSettings);
        // Jsoup escapes non-breaking spaces as the "&nbsp;" entity; map them back to plain spaces.
        return output != null ? output.replace("&nbsp;", " ") : null;
    }
}