#This script transforms the input JATS XML-like files into a different format that preserves the information of interest
#(pmcid, text, mined accession numbers, accession numbers annotated by publishers).
#Characters in the full texts are escaped via regular expressions in order to generate well-formed XML
#(e.g., ''$t=~s/>/&gt;/g;'').

#Developed by Senay Kafkas, 31 January 2013
#Updated by Alessia Bardi
#EBI, Literature Services

use strict;

# Text of the article currently being accumulated. It is a package global
# because process() and the proc_* subs read it directly.
our $doc = "";
my $line;
# Number of "<!DOCTYPE " lines seen so far; the code in this script only
# increments it.
my $cnt = 0;

# Decompress the input with gunzip. The list form of open runs gunzip directly,
# without a shell, so a file name containing spaces or shell metacharacters is
# passed through literally. When no file name is given on the command line,
# gunzip reads the compressed data from STDIN.
my @gunzip = ("gunzip", "-c");
push(@gunzip, $ARGV[0]) if defined $ARGV[0] && $ARGV[0] ne "";
my $source = @gunzip > 2 ? $ARGV[0] : "STDIN";
open(my $in, "-|", @gunzip) || die "can't open the file $source: $!";

# Every "<!DOCTYPE " line marks the start of a new article: emit whatever has
# been accumulated so far, if anything, and start over. The DOCTYPE line itself
# is not kept.
while ($line = <$in>) {
    if ($line =~ /<!DOCTYPE /) {
        $cnt++;
        if ($doc ne "") {
            process();
            $doc = "";
        }
    }
    else {
        $doc .= $line;
    }
}

# Emit the last article. Nothing is printed when no content was accumulated
# (e.g. empty input), mirroring the guard used inside the loop.
process() if $doc ne "";

# For a pipe, close also reports a non-zero exit status of the child, so a
# gunzip failure (corrupt or non-gzip input) is no longer silently ignored.
close($in)
    or die($! ? "error closing the gunzip pipe for $source: $!"
              : "gunzip exited with status " . ($? >> 8) . " for $source");



##############   subs  #######################

# Print the article currently held in the package global $doc as one simplified
# <article> element on STDOUT.
#
# Output order: the opening <article> tag (declaring the xlink and mml
# namespaces), <pmcid>, one <ext-link .../> per string returned by
# proc_extlinks(), one <acc .../> per string returned by proc_minedAcc(), one
# <text> element per string returned by proc_text(), and the closing
# </article> tag.
#
# Takes no arguments and returns nothing useful; all input comes from
# proc_pmcid(), proc_text(), proc_extlinks() and proc_minedAcc().
sub process {
    # Numeric character references that are replaced by the character itself.
    # The characters are emitted exactly as they are encoded in this source
    # file. References to "&" (&amp;, &#x0026;, &#x00026;) are deliberately
    # not decoded, so the text stays well-formed XML.
    my %char_for = (
        '000ed' => 'í',
        '000e3' => 'ã',
        '000e9' => 'é',
        '000e4' => 'ä',
        '000e0' => 'à',
        '000d6' => 'Ö',    # U+00D6 is the capital letter, not 'ö' (U+00F6)
        '000fc' => 'ü',
        '0002f' => '/',
        '02019' => '’',
        '02013' => '–',
    );
    my $known_refs = join('|', sort keys %char_for);

    print '<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML">';
    print "\n";
    print "<pmcid>" . proc_pmcid() . "</pmcid>\n";

    my @tdata     = proc_text();
    my @extlink   = proc_extlinks();
    my @minedAccs = proc_minedAcc();

    # Each entry is the raw attribute string of the original tag, so it is
    # re-emitted as an empty element carrying the same attributes.
    for my $link (@extlink) {
        print "<ext-link" . $link . "/>\n";
    }
    for my $acc (@minedAccs) {
        print "<acc" . $acc . "/>\n";
    }

    for my $t (@tdata) {
        # None of the replacement characters contains "&", so one pass over
        # the table gives the same result as one substitution per reference.
        $t =~ s/&#x($known_refs);/$char_for{$1}/g;

        # Escape any markup left inside the text so that the <text> element
        # only contains character data.
        $t =~ s/</&lt;/g;
        $t =~ s/>/&gt;/g;
        $t =~ s/&#x0003e;/&gt;/g;

        print "<text>" . $t . "</text>\n";
    }

    print "</article>\n";
}


# Return the content of the first <article-id pub-id-type="pmcid"> element found
# in the package global $doc, or the empty string when there is none.
# The identifier must be a run of non-whitespace characters.
sub proc_pmcid {
    # In list context the match yields the captured group, or nothing at all
    # when the pattern does not match.
    my ($pmcid) = $doc =~ /<article-id pub-id-type=\"pmcid\">(\S+?)<\/article-id>/;
    return defined $pmcid ? $pmcid : "";
}

# Return the content of every <plain>...</plain> element found in the package
# global $doc, in document order. Returns an empty list when there is none.
sub proc_text {
    # $doc is a multi-line buffer: /s lets "." match newlines, so a <plain>
    # element whose content spans several lines is captured instead of being
    # silently skipped. In list context a /g match with one capture group
    # returns every captured string.
    my @textdata = $doc =~ /<plain>(.*?)<\/plain>/gs;
    return @textdata;
}

# Return, for every <ext-link ...> start tag found in the package global $doc,
# the raw text between "<ext-link" and the closing ">" (or "/>"), i.e. the
# attribute string including its leading whitespace. Both forms are matched:
#<ext-link ext-link-type="uri" xlink:href="http://breast-cancer-research.com/vol1no1/01sep99/research/2"/>
#<ext-link ext-link-type="uri" xlink:href="http://breast-cancer-research.com/vol1no1/01sep99/research/2">foo</ext-link>
sub proc_extlinks {
    # A /g match in list context returns the captured group of every match.
    my @attribute_strings = $doc =~ /<ext-link(.*?)\/?>/g;
    return @attribute_strings;
}

# Return, for every <z:acc ...> start tag found in the package global $doc, the
# raw text between "<z:acc" and the closing ">" (or "/>"), i.e. the attribute
# string including its leading whitespace. Example of a matched element:
#<z:acc db="gen" ids="AAR08135">AAR08135</z:acc>
sub proc_minedAcc {
    my @attribute_strings;
    # Scalar-context /g match: each iteration resumes after the previous tag.
    push(@attribute_strings, $1) while $doc =~ /<z:acc(.*?)\/?>/g;
    return @attribute_strings;
}
