package eu.dnetlib.iis.utils.contents; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.HashSet; import java.util.Set; import org.apache.avro.file.DataFileWriter; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import eu.dnetlib.iis.core.java.io.CloseableIterator; import eu.dnetlib.iis.core.java.io.DataStore; import eu.dnetlib.iis.core.java.io.FileSystemPath; import eu.dnetlib.iis.metadataextraction.schemas.DocumentText; import eu.dnetlib.iis.utils.contents.schemas.wos.WoS2OpenAIRE; /** * @author Mateusz Kobos */ public class WoSConverter { private final File input; private final File output; private final File outputMappingDir; public static void main(String[] args) throws IOException { WoSConverter converter = parse(args); converter.run(); } private static WoSConverter parse(String[] args) throws IOException { if (args.length != 3) { parseError("This program converts Web of Science metadata records " + "stored in TSV (tab separated values) file into a data store. " + "Two arguments should be given: 1) \"input file\" and " + "2) output dir for DocumentText data store; " + "3) output dir for data store with WoS and OpenAIRE IDs " + "outputMappingDir"); } String input = args[0]; String output = args[1]; String outputMappingDir = args[2]; return new WoSConverter(new File(input), new File(output), new File(outputMappingDir)); } private static void parseError(String error) { System.err.println("ERROR while parsing command line: " + error); System.exit(1); } public WoSConverter(File input, File output, File mapping) { this.input = input; this.output = output; this.outputMappingDir = mapping; } public void run() throws IOException{ WoSCSVReader reader = new WoSCSVReader(this.input); DataFileWriter writer = DataStore.create(new FileSystemPath(output), DocumentText.SCHEMA$); DataFileWriter mappingWriter = DataStore.create(new FileSystemPath(outputMappingDir), WoS2OpenAIRE.SCHEMA$); try { DocumentText docText = new DocumentText(); WoS2OpenAIRE idMap = new WoS2OpenAIRE(); Set alreadyProcessedWOSIds = new HashSet(); while(reader.hasNext()){ WoSRecord record = reader.next(); String wosId = record.id; if(alreadyProcessedWOSIds.contains(wosId)){ continue; } alreadyProcessedWOSIds.add(wosId); String openAIREId = convertToOpenAIREId(wosId); idMap.setWosId(wosId); idMap.setOpenAIREId(openAIREId); mappingWriter.append(idMap); docText.setId(openAIREId); docText.setText(record.fundingText); writer.append(docText); } } finally { if(writer != null){ writer.close(); } if (mappingWriter != null){ mappingWriter.close(); } if(reader != null){ reader.close(); } } } private static String convertToOpenAIREId(String wosId) { return Utils.convertToOpenAIREId("webcrawl____::", "", wosId); } } class WoSRecord{ public String id; public String fundingText; public WoSRecord(String id, String fundingText) { this.id = id; this.fundingText = fundingText; } } class WoSCSVReader implements CloseableIterator{ private static final String idColumnName = "UT"; private static final String textColumnName = "FX"; private final LineIterator iterator; private int lineNo = 0; private int idColumnIndex; private int textColumnIndex; private int columnsCount = -1; public WoSCSVReader(File file){ try { this.iterator = FileUtils.lineIterator(file, "UTF-8"); } catch (IOException e) { throw new RuntimeException(e); } this.lineNo = 0; } @Override public boolean hasNext() { return this.iterator.hasNext(); } @Override public WoSRecord next() { lineNo++; try{ String line = this.iterator.nextLine(); String[] rawFields = line.split("\t"); /** Handle header */ if(lineNo == 1){ idColumnIndex = getFieldIndex(rawFields, idColumnName); textColumnIndex = getFieldIndex(rawFields, textColumnName); columnsCount = rawFields.length; return next(); } String[] fields = fixNumberOfColumns(rawFields, columnsCount); String id = fields[idColumnIndex]; if(id == null){ System.err.println("Skipping line "+lineNo+" since the field "+ "corresponding to ID (with column name "+ "\""+idColumnName+"\") is empty"); return next(); } return new WoSRecord(id, fields[textColumnIndex]); } catch(Exception ex) { throw new RuntimeException("ERROR in line "+lineNo+" of the "+ "read file: "+ex, ex); } } private static String[] fixNumberOfColumns(String[] elements, int expectedColumnsCount){ if(elements.length == expectedColumnsCount){ return elements; } else if(elements.length < expectedColumnsCount){ return Arrays.copyOf(elements, expectedColumnsCount); } else { throw new RuntimeException( String.format("The line contains %d columns but "+ "only %d were expected", elements.length, expectedColumnsCount)); } } private static int getFieldIndex(String[] fields, String value){ for(int i = 0; i < fields.length; i++){ if(fields[i].equals(value)){ return i; } } throw new RuntimeException(String.format( "Field \"%s\" not found ", value)); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public void close() throws IOException { if(iterator != null){ iterator.close(); } } }