package eu.dnetlib.data.mapreduce.hbase.lodExport.utils;

import eu.dnetlib.data.mapreduce.util.LicenseComparator;
import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
import eu.dnetlib.data.proto.DatasourceProtos.Datasource.Metadata;
import eu.dnetlib.data.proto.FieldTypeProtos;
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.OafProtos.OafRel;
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
import eu.dnetlib.data.proto.PersonProtos;
import eu.dnetlib.data.proto.ProjectProtos.Project;
import eu.dnetlib.data.proto.ResultProtos;
import eu.dnetlib.data.proto.ResultProtos.Result;
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
import org.apache.log4j.Logger;

import java.util.ArrayList;
import java.util.List;

/**
 * Simple serializer that parses input Oaf protos and prepares them for sqoop.
 *
 * <p>Every entity or relation is flattened into a single string of fields, each field
 * terminated by the field delimiter ({@code DELIM}). Multi-valued fields are written as
 * one field whose values are each terminated by the value separator ({@code SEPERATOR}).
 *
 * <p>NOTE: the delimiter and the separator are stored in <em>static</em> fields that are
 * overwritten by every constructor call, so all instances in a JVM share the values of
 * the most recently constructed one.
 *
 * @author eri
 */
public class Serializer {

    private static final Logger log = Logger.getLogger(Serializer.class);

    /** Text written for a missing or empty field. */
    private static final String NULL_STRING = "";

    /** Field delimiter; shared by all instances (see class comment). */
    private static String DELIM;

    /** Separator between the values of a multi-valued field; shared by all instances. */
    private static String SEPERATOR;

    private final FundingParser fundingParser;

    /**
     * @param delim     field delimiter appended after every field
     * @param seperator separator appended after every value of a multi-valued field
     */
    public Serializer(String delim, String seperator) {
        DELIM = delim;
        SEPERATOR = seperator;
        this.fundingParser = new FundingParser(delim);
    }

    /**
     * Serializes an entity or a relation into one delimited row.
     *
     * @param oaf the proto to serialize
     * @return the delimited row, or {@code null} when the kind, entity type or relation
     *         type is not handled
     */
    public String serialize(Oaf oaf) {
        switch (oaf.getKind()) {
            case entity:
                OafEntity valueEntity = oaf.getEntity();
                switch (valueEntity.getType()) {
                    case datasource:
                        return buildDatasource(valueEntity);
                    case organization:
                        return buildOrganization(valueEntity);
                    case project:
                        return buildProject(valueEntity);
                    case result:
                        return buildResult(valueEntity);
                    case person:
                        return buildPerson(valueEntity);
                    default:
                        log.error("wrong type");
                        break;
                }
                break;
            case relation:
                return buildRel(oaf.getRel());
        }
        return null;
    }

    /**
     * Serializes a relation into one delimited row.
     *
     * @param oaf the relation to serialize
     * @return the delimited row, or {@code null} for an unhandled relation type
     */
    public String serialize(OafRel oaf) {
        return buildRel(oaf);
    }

    /**
     * Derives the relation rows that are embedded in an entity: the hosting/collecting
     * datasources of a result and the dedup links of results, datasources, persons and
     * organizations.
     *
     * @param oaf a proto whose entity part is read (the kind is not checked)
     * @return the relation rows; empty when the entity type yields none
     */
    public ArrayList<String> extractRelations(Oaf oaf) {
        OafEntity valueEntity = oaf.getEntity();
        ArrayList<String> relations = new ArrayList<String>();

        switch (valueEntity.getType()) {
            case result:
                relations.addAll(getResultDatasources(valueEntity));
                relations.addAll(getDedups(valueEntity));
                break;
            case datasource:
            case person:
            case organization:
                relations.addAll(getDedups(valueEntity));
                break;
            default:
                break;
        }
        return relations;
    }

    /**
     * Builds the row of a relation: relation type, source entity type, source id,
     * target entity type, target id.
     *
     * @return the row, or {@code null} for a relation type that is not listed here
     */
    private String buildRel(OafRel rel) {
        switch (rel.getRelType()) {
            case datasourceOrganization:
                return relationRow(rel, "datasource", "organization");
            case resultResult:
                return relationRow(rel, "result", "result");
            case personPerson:
                // Was " person" (leading space), unlike every other relation label.
                return relationRow(rel, "person", "person");
            case organizationOrganization:
                return relationRow(rel, "organization", "organization");
            case personResult:
                return relationRow(rel, "person", "result");
            case projectOrganization:
                return relationRow(rel, "project", "organization");
            case projectPerson:
                return relationRow(rel, "project", "person");
            case resultOrganization:
                return relationRow(rel, "result", "organization");
            case resultProject:
                return relationRow(rel, "result", "project");
            default:
                return null;
        }
    }

    /** Formats one relation row for the given source and target entity type labels. */
    private String relationRow(OafRel rel, String sourceType, String targetType) {
        return getStringField(rel.getRelType().name())
                + getStringField(sourceType)
                + getStringField(rel.getSource())
                + getStringField(targetType)
                + getStringField(rel.getTarget());
    }

    /**
     * Builds the fields common to every entity row: entity type, id, date of
     * transformation, date of collection and the original ids.
     */
    private String getHeader(OafEntity data) {
        StringBuilder buff = new StringBuilder();

        // EntityType
        buff.append(getStringField(data.getType().name()));
        // OpenaireID
        buff.append(cleanId(data.getId())).append(DELIM);
        // dateOfTransformation
        // TODO change to dateOfTransformation here when released
        buff.append(getStringDateField(data.getDateofcollection()));
        // dateOfCollection
        buff.append(getStringDateField(data.getDateofcollection()));

        // originalId: written even when empty, values are not cleaned
        for (String oid : data.getOriginalIdList()) {
            buff.append(oid).append(SEPERATOR);
        }
        buff.append(DELIM);

        return buff.toString();
    }

    /** Builds the row of a datasource entity. */
    private String buildDatasource(OafEntity data) {
        Datasource d = data.getDatasource();
        Metadata metadata = d.getMetadata();

        StringBuilder buff = new StringBuilder(getHeader(data));

        // Datasourcetype: only the part after the last "::" of the class name
        if (metadata.hasDatasourcetype()) {
            buff.append(getStringField(metadata.getDatasourcetype().getClassname().replaceFirst(".*::", "")));
        } else {
            buff.append(getStringField(null));
        }
        // Openairecompatibility
        buff.append(getStringField(metadata.getOpenairecompatibility().getClassname()));
        // OfficialName
        buff.append(getStringField(metadata.getOfficialname().getValue()));
        // Englishname
        buff.append(getStringField(metadata.getEnglishname().getValue()));
        // Websiteurl
        buff.append(getStringField(metadata.getWebsiteurl().getValue()));
        // LogoURL
        buff.append(getStringField(metadata.getLogourl().getValue()));
        // Contactemail
        buff.append(getStringField(metadata.getContactemail().getValue()));
        // Namespaceprefix
        buff.append(getStringField(metadata.getNamespaceprefix().getValue()));
        // latitude
        buff.append(getStringField(metadata.getLatitude().getValue()));
        // longitude
        buff.append(getStringField(metadata.getLongitude().getValue()));
        // dateofvalidation
        buff.append(getStringDateField(metadata.getDateofvalidation().getValue()));
        // Description
        buff.append(getStringField(metadata.getDescription().getValue()));

        // subjects
        StringBuilder subjects = new StringBuilder();
        for (StructuredProperty s : metadata.getSubjectsList()) {
            subjects.append(clean(s.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(subjects.toString()));

        // Number of items
        buff.append(getStringField(metadata.getOdnumberofitems().getValue()));
        // Date of number of items
        buff.append(getStringDateField(metadata.getOdnumberofitemsdate().getValue()));
        // Policies
        buff.append(getStringField(metadata.getOdpolicies().getValue()));

        // languages
        StringBuilder languages = new StringBuilder();
        for (StringField lang : metadata.getOdlanguagesList()) {
            languages.append(clean(lang.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(languages.toString()));

        // Content type
        StringBuilder contentTypes = new StringBuilder();
        for (StringField c : metadata.getOdcontenttypesList()) {
            contentTypes.append(clean(c.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(contentTypes.toString()));

        // Access info package
        StringBuilder accessInfo = new StringBuilder();
        for (StringField c : metadata.getAccessinfopackageList()) {
            accessInfo.append(clean(c.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(accessInfo.toString()));

        // Release start date
        buff.append(getStringDateField(metadata.getReleasestartdate().getValue()));
        // Release end date
        buff.append(getStringDateField(metadata.getReleaseenddate().getValue()));
        // Mission statement url
        buff.append(getStringField(metadata.getMissionstatementurl().getValue()));
        // Data provider
        buff.append(getStringField(String.valueOf(metadata.getDataprovider().getValue())));
        // Service provider
        buff.append(getStringField(String.valueOf(metadata.getServiceprovider().getValue())));
        // Database access restriction
        buff.append(getStringField(metadata.getDatabaseaccessrestriction().getValue()));
        // Data upload type
        buff.append(getStringField(metadata.getDatauploadtype().getValue()));
        // Data upload restrictions
        buff.append(getStringField(metadata.getDatauploadrestriction().getValue()));
        // Versioning
        buff.append(getStringField(String.valueOf(metadata.getVersioning().getValue())));
        // Citation guideline url
        buff.append(getStringField(String.valueOf(metadata.getCitationguidelineurl().getValue())));
        // Quality management kind
        buff.append(getStringField(String.valueOf(metadata.getQualitymanagementkind().getValue())));
        // PID systems
        buff.append(getStringField(metadata.getPidsystems().getValue()));
        // Certificates
        buff.append(getStringField(metadata.getCertificates().getValue()));

        // Policies
        StringBuilder policies = new StringBuilder();
        for (FieldTypeProtos.KeyValue property : metadata.getPoliciesList()) {
            policies.append(clean(property.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(policies.toString()));

        buff.append(getTrust(data));
        return buff.toString();
    }

    /** Builds the row of an organization entity. */
    private String buildOrganization(OafEntity data) {
        Organization organization = data.getOrganization();
        eu.dnetlib.data.proto.OrganizationProtos.Organization.Metadata metadata = organization.getMetadata();

        StringBuilder buff = new StringBuilder(getHeader(data));

        // legal short name
        buff.append(getStringField(metadata.getLegalshortname().getValue()));
        // name
        buff.append(getStringField(metadata.getLegalname().getValue()));

        // website URL: a comma separated value is re-emitted as a multi-valued field
        StringBuilder urls = new StringBuilder();
        for (String url : metadata.getWebsiteurl().getValue().split(",")) {
            urls.append(url).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(urls.toString()));

        // logourl
        buff.append(getStringField(metadata.getLogourl().getValue()));
        // country
        buff.append(getStringField(metadata.getCountry().getClassid()));

        buff.append(getTrust(data));
        return buff.toString();
    }

    /**
     * Returns the trust field of an entity, taken from its first extra-info element.
     *
     * @return the trust value followed by the delimiter, or an empty field when the
     *         entity carries no extra info
     */
    String getTrust(OafEntity data) {
        if (!data.getExtraInfoList().isEmpty()) {
            return getStringField(data.getExtraInfoList().get(0).getTrust());
        }
        return NULL_STRING + DELIM;
    }

    /** Builds the row of a result entity. */
    private String buildResult(OafEntity data) {
        Result result = data.getResult();
        eu.dnetlib.data.proto.ResultProtos.Result.Metadata metadata = result.getMetadata();

        StringBuilder buff = new StringBuilder(getHeader(data));

        // title: only the first title is exported, with whitespace runs collapsed
        String titleString = "";
        if (!metadata.getTitleList().isEmpty()) {
            titleString = metadata.getTitleList().get(0).getValue().replaceAll("\\s+", " ");
        }
        buff.append(getStringField(titleString));

        // date of acceptance CHANGED THIS TO DATE FORMAT
        buff.append(getStringDateField(metadata.getDateofacceptance().getValue()));
        // publisher
        buff.append(getStringField(metadata.getPublisher().getValue()));

        // PID
        StringBuilder pids = new StringBuilder();
        for (StructuredProperty p : data.getPidList()) {
            pids.append(clean(p.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(pids.toString()));

        // language
        buff.append(getStringField(metadata.getLanguage().getClassid()));

        // RelevantDate: only the first one is exported
        String relevantDate = "";
        if (!metadata.getRelevantdateList().isEmpty()) {
            relevantDate += metadata.getRelevantdateList().get(0).getValue();
        }
        buff.append(getStringDateField(relevantDate));

        // Subject: empty values are skipped
        StringBuilder subjects = new StringBuilder();
        for (StructuredProperty subj : metadata.getSubjectList()) {
            if (subj.getValue() != null && !subj.getValue().isEmpty()) {
                subjects.append(clean(subj.getValue())).append(SEPERATOR);
            }
        }
        buff.append(getMultipleStringField(subjects.toString()));

        // TODO ExternalReference (placeholder: a single blank)
        buff.append(getStringField(" "));

        // Source
        StringBuilder source = new StringBuilder();
        for (StringField s : metadata.getSourceList()) {
            source.append(clean(s.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(source.toString()));

        // TODO Format (placeholder: empty field)
        buff.append(getStringField(""));

        // Context: ids are written without cleaning
        StringBuilder context = new StringBuilder();
        for (Result.Context s : metadata.getContextList()) {
            context.append(s.getId()).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(context.toString()));

        // TODO country (placeholder: empty field)
        buff.append(getStringField(""));

        // Best License
        buff.append(getStringField(getBestLicense(result)));

        // Description: only the first one is exported. Uses the field's value; the
        // previous code appended the whole message (its toString()) instead.
        String description = "";
        if (!metadata.getDescriptionList().isEmpty()) {
            description += metadata.getDescriptionList().get(0).getValue();
        }
        buff.append(getStringField(description));

        // Journal
        buff.append(getStringField(metadata.getJournal().getName().replaceAll("\n", " ")));
        // TODO provenance and similarity could be taken from the journal's data info
        // TODO isRelatedTo

        // resource type
        buff.append(getStringField(metadata.getResourcetype().getClassname()));
        // device
        buff.append(getStringField(metadata.getDevice().getValue()));
        // size
        buff.append(getStringField(metadata.getSize().getValue()));
        // version
        buff.append(getStringField(metadata.getVersion().getValue()));
        // metadata update
        buff.append(getStringDateField(metadata.getLastmetadataupdate().getValue()));
        // metadata version
        buff.append(getStringField(metadata.getMetadataversionnumber().getValue()));
        // year
        buff.append(getYearInt(metadata.getDateofacceptance().getValue()));
        // type
        buff.append(getStringField(metadata.getResulttype().getClassname()));

        buff.append(getTrust(data));
        return buff.toString();
    }

    /** Builds the row of a project entity. */
    private String buildProject(OafEntity data) {
        Project project = data.getProject();
        eu.dnetlib.data.proto.ProjectProtos.Project.Metadata metadata = project.getMetadata();

        StringBuilder buff = new StringBuilder(getHeader(data));

        // Code
        buff.append(getStringField(metadata.getCode().getValue()));
        // Websiteurl
        buff.append(getStringField(metadata.getWebsiteurl().getValue()));
        // Acronym
        buff.append(getStringField(metadata.getAcronym().getValue()));
        // Title
        buff.append(getStringField(metadata.getTitle().getValue()));
        // Startdate
        buff.append(getNumericField(metadata.getStartdate().getValue()));
        // Enddate
        buff.append(getNumericField(metadata.getEnddate().getValue()));
        // Call identifier
        buff.append(getStringField(metadata.getCallidentifier().getValue()));
        // KeyWords
        buff.append(getStringField(metadata.getKeywords().getValue()));
        // Duration
        buff.append(getStringField(metadata.getDuration().getValue()));
        // ecsc39
        buff.append(getStringField(metadata.getEcsc39().getValue().toString()));
        // Contracttype
        buff.append(getStringField(metadata.getContracttype().getClassname()));
        // OA mandate pubs
        buff.append(getStringField(metadata.getOamandatepublications().getValue()));

        // Subjects
        StringBuilder subjects = new StringBuilder();
        for (StructuredProperty s : metadata.getSubjectsList()) {
            subjects.append(clean(s.getValue())).append(SEPERATOR);
        }
        buff.append(getMultipleStringField(subjects.toString()));

        // EC293
        buff.append(getStringField(metadata.getEcarticle293().getValue()));

        // Funding: parsed from the first funding tree only; an empty tree is handed to
        // the parser when there is none.
        // TODO funder + 3 funding levels
        if (!metadata.getFundingtreeList().isEmpty()) {
            buff.append(this.fundingParser.getFundingInfo(metadata.getFundingtreeList().get(0).getValue()));
        } else {
            buff.append(this.fundingParser.getFundingInfo(""));
        }

        buff.append(getTrust(data));
        return buff.toString();
    }

    /** Builds the row of a person entity. */
    private String buildPerson(OafEntity data) {
        PersonProtos.Person person = data.getPerson();
        eu.dnetlib.data.proto.PersonProtos.Person.Metadata metadata = person.getMetadata();

        StringBuilder buff = new StringBuilder(getHeader(data));

        // firstname
        buff.append(getStringField(metadata.getFirstname().getValue()));

        // secondNames: joined with blanks into a single-valued field
        StringBuilder secondNames = new StringBuilder();
        for (StringField s : metadata.getSecondnamesList()) {
            secondNames.append(s.getValue()).append(' ');
        }
        buff.append(getStringField(secondNames.toString()));

        // fullname
        buff.append(getStringField(metadata.getFullname().getValue()));
        // Fax
        buff.append(getStringField(metadata.getFax().getValue()));
        // Email
        buff.append(getStringField(metadata.getEmail().getValue()));
        // Phone
        buff.append(getStringField(metadata.getPhone().getValue()));
        // Nationality
        buff.append(getStringField(metadata.getNationality().getClassid()));

        // PIDS: joined with ";" (not SEPERATOR) and cleaned as one single-valued field
        StringBuilder pids = new StringBuilder();
        for (StructuredProperty s : data.getPidList()) {
            pids.append(s.getValue()).append(";");
        }
        buff.append(getStringField(pids.toString()));

        buff.append(getTrust(data));
        return buff.toString();
    }

    /**
     * Builds one "resultDatasource" relation row per instance that has a hosted-by key
     * and one per collected-from key of the result.
     */
    private ArrayList<String> getResultDatasources(OafEntity valueEntity) {
        ArrayList<String> buffs = new ArrayList<String>();
        Result result = valueEntity.getResult();

        // TODO hosted by
        for (Instance instance : result.getInstanceList()) {
            String hostedBy = instance.getHostedby().getKey();
            if (hostedBy != null && !hostedBy.isEmpty()) {
                buffs.add(resultDatasourceRow(valueEntity, hostedBy));
            }
        }

        // TODO collected froms
        for (FieldTypeProtos.KeyValue collectedFromValue : valueEntity.getCollectedfromList()) {
            String collectedFrom = collectedFromValue.getKey();
            if (collectedFrom != null && !collectedFrom.isEmpty()) {
                buffs.add(resultDatasourceRow(valueEntity, collectedFrom));
            }
        }
        return buffs;
    }

    /** Formats a "resultDatasource" relation row from a result to the given datasource id. */
    private String resultDatasourceRow(OafEntity resultEntity, String datasourceId) {
        return getStringField("resultDatasource")
                + getStringField("result")
                + getStringField(cleanId(resultEntity.getId()))
                + getStringField("datasource")
                + getStringField(datasourceId);
    }

    /**
     * Normalizes an identifier.
     *
     * <p>DO NOT CHANGE THIS: it removes the id prefix (5|datacite____:: to datacite____::)
     * and replaces occurrences of the delimiter characters in the data.
     *
     * <p>NOTE(review): {@code DELIM} and {@code SEPERATOR} are passed to
     * {@code replaceAll} and are therefore interpreted as regular expressions; confirm
     * that the configured values contain no regex metacharacters.
     *
     * @param value the raw id, may be {@code null}
     * @return the trimmed id without prefix, newlines and delimiter characters, or
     *         {@code null} when {@code value} is {@code null}
     */
    public String cleanId(String value) {
        if (value != null) {
            // greedy: drops everything up to and including the last '|'
            value = value.replaceFirst(".*\\|", "");
            value = value.replaceAll("\n", "");
            value = value.replaceAll(DELIM, " ");
            value = value.replaceAll(SEPERATOR, " ");
            value = value.trim();
        }
        return value;
    }

    /**
     * Builds one "dedup" relation row for every child of the entity that has the same
     * entity type as the entity itself.
     */
    // TODO make them in pairs
    private ArrayList<String> getDedups(OafEntity valueEntity) {
        ArrayList<String> buffs = new ArrayList<String>();

        if (!valueEntity.getChildrenList().isEmpty()) {
            String typeName = valueEntity.getType().name();
            String header = getStringField("dedup")
                    + getStringField(typeName)
                    + getStringField(cleanId(valueEntity.getId()))
                    + getStringField(typeName);

            for (OafEntity child : valueEntity.getChildrenList()) {
                // a child of the same type is a deduplicated record
                if (child.getType() == valueEntity.getType()) {
                    buffs.add(header + getStringField(cleanId(child.getId())));
                }
            }
        }
        return buffs;
    }

    /** Returns the value unchanged followed by the delimiter, or an empty field. */
    private String getNumericField(String data) {
        if (data == null || data.isEmpty()) {
            return NULL_STRING + DELIM;
        }
        return data + DELIM;
    }

    /**
     * Extracts the part of a date before its first '-' as the year field.
     *
     * @return the year followed by the delimiter, or an empty field when the date is
     *         {@code null}, empty, "-1" or has no part before the first '-'
     */
    private String getYearInt(String data) {
        if (data == null || data.isEmpty() || data.equals("-1")) {
            return NULL_STRING + DELIM;
        }

        String[] split = data.split("-");
        // split() drops trailing empty strings, so input made only of '-' yields no parts
        if (split.length == 0) {
            return NULL_STRING + DELIM;
        }
        return cleanNumber(split[0]) + DELIM;
    }

    /** Removes every character that is not an ASCII letter, digit, ':', ',' or '_'. */
    private static String cleanNumber(String number) {
        return number.replaceAll("[^A-Za-z0-9:,____]", "");
    }

    /** Returns the cleaned value followed by the delimiter, or an empty field. */
    private String getStringField(String data) {
        if (data == null || data.isEmpty()) {
            return NULL_STRING + DELIM;
        }
        String field = clean(data);
        if (field == null) {
            return NULL_STRING + DELIM;
        }
        return field + DELIM;
    }

    /**
     * Returns an already joined multi-valued field followed by the delimiter, or an
     * empty field. The value is not cleaned again so that the separators survive.
     */
    private String getMultipleStringField(String data) {
        if (data == null || data.isEmpty()) {
            return NULL_STRING + DELIM;
        }
        return data + DELIM;
    }

    /**
     * Returns a date value unchanged followed by the delimiter, or an empty field when
     * the value is {@code null}, empty or "-1".
     */
    private String getStringDateField(String data) {
        if (data == null || data.isEmpty() || data.equals("-1")) {
            return NULL_STRING + DELIM;
        }
        return data + DELIM;
    }

    /**
     * Returns the cleaned id of an entity, or the cleaned source id of a relation.
     *
     * <p>NOTE(review): only the tail of this method
     * ({@code return cleanId(oaf.getRel().getSource()); } return null; }) was present in
     * the file, as a dangling fragment that broke compilation. The method name, its
     * visibility and the entity branch are reconstructed - confirm against the callers
     * and the version history.
     *
     * @return the id, or {@code null} for any other kind
     */
    public String getId(Oaf oaf) {
        switch (oaf.getKind()) {
            case entity:
                return cleanId(oaf.getEntity().getId());
            case relation:
                return cleanId(oaf.getRel().getSource());
        }
        return null;
    }

    /**
     * Picks, among the licences of all instances of a result, the one that
     * {@link LicenseComparator} ranks best.
     *
     * @return the class name of the best licence, or {@code null} when none was selected
     */
    private String getBestLicense(ResultProtos.Result result) {
        FieldTypeProtos.Qualifier bestLicense = null;
        LicenseComparator lc = new LicenseComparator();

        for (ResultProtos.Result.Instance instance : result.getInstanceList()) {
            if (lc.compare(bestLicense, instance.getLicence()) > 0) {
                bestLicense = instance.getLicence();
            }
        }
        return bestLicense != null ? bestLicense.getClassname() : null;
    }

    /**
     * Sanitizes a field value so that it cannot break the row format.
     *
     * <p>TODO DO NOT CHANGE THIS: it removes the id prefix (5|datacite____:: to
     * datacite____::) and replaces occurrences of the delimiter characters in the data.
     * The statements are kept in their original order on purpose.
     *
     * <p>NOTE(review): {@code DELIM} and {@code SEPERATOR} are interpreted as regular
     * expressions here. Also, inside the final character class {@code .-_} is a range
     * (from '.' to '_'), so a literal '-' is removed while characters such as ';', '?'
     * and '@' are kept - confirm that this is intended.
     *
     * @param value the raw value, may be {@code null}
     * @return the sanitized value, or {@code null} when {@code value} is {@code null}
     */
    private String clean(String value) {
        if (value != null) {
            // greedy: drops everything up to and including the last '|'
            value = value.replaceFirst(".*\\|", "");
            value = value.replaceAll(DELIM, " ");
            value = value.replaceAll("\\\\", " ");
            value = value.replaceAll("\"", "");
            value = value.replaceAll(SEPERATOR, " ");
            value = value.replaceAll("\\r\\n|\\r|\\n", " ");
            value = value.replaceAll("\\s+", " ");
            value = value.replaceAll("(\\r|\\n)", " ");
            value = value.replaceAll("\\t", " ");
            value = value.replaceAll("\n", " ");
            value = value.replaceAll("[^a-zA-Z0-9 .-_:/]+", "");
            value = value.replace("\\", " ");
        }
        return value;
    }

    /** @return the field delimiter shared by all instances */
    public String getDELIM() {
        return DELIM;
    }

    /** Sets the field delimiter for all instances. */
    public void setDELIM(String dELIM) {
        DELIM = dELIM;
    }

    /** @return the multi-value separator shared by all instances */
    public static String getSEPERATOR() {
        return SEPERATOR;
    }

    /** Sets the multi-value separator for all instances. */
    public static void setSEPERATOR(String SEPERATOR) {
        Serializer.SEPERATOR = SEPERATOR;
    }
}