#Adds a root element to the input (non-well-formed) XML file and splits it, generating one well-formed, OAI-PMH compliant XML file for each article node.
#Outputs are saved under the directory ./SPLITTED/$ARGV[1]
#Args: 
#$ARGV[0] = path to the file to split (with no root).
#$ARGV[1] = the name of the subfolder (under ./SPLITTED) in which to store the split XML files.
#Developed by Alessia Bardi, 4 February 2013
#EBI, Literature Services

use strict;
use utf8;
use Encode qw(encode_utf8);
use warnings;
use XML::LibXML;
use XML::LibXSLT;
use POSIX;
use Encode;
use File::Path;
use MIME::Base64 qw(encode_base64);
use Storable;
# Progress messages go to STDOUT as UTF-8.
binmode(STDOUT, ":utf8");

my $parser    = XML::LibXML->new();

# Both arguments are mandatory; fail early with a usage message instead of
# continuing with undefined paths.
die "Usage: $0 <file-to-split> <output-subfolder>\n"
    unless @ARGV >= 2 && length($ARGV[0]) && length($ARGV[1]);

#arg0 is the path to the file to split (with no root)
my $file=$ARGV[0];
#arg1 is the string to use as subfolder where to store the split xml files.
my $outdir=$ARGV[1];

# make_path creates missing parents (./SPLITTED itself included), is a no-op
# when the directory already exists, and croaks if creation fails. A plain
# unchecked mkdir silently failed whenever ./SPLITTED was absent.
File::Path::make_path("./SPLITTED/".$outdir);

open my $in,  '<',  $file      or die "Can't read old file: $!";
open my $out, '>', "$file.new" or die "Can't write new file: $!";

#add the root: copy the input verbatim between <articles> and </articles>
print $out '<articles>';
while ( my $line = <$in> ) { print $out $line; }
print $out "</articles>";
close $in;
# Buffered write errors (e.g. disk full) only surface at close time.
close $out or die "Can't finish writing new file: $!";
    
#now we can parse the file and split it
# Every <article> element of the wrapped file becomes one output document:
#   <record><header><identifier>PMCID</identifier></header>
#           <metadata><article>...</article></metadata></record>
# saved as ./SPLITTED/$outdir/PMCID.xml.
my $root= $parser->parse_file("$file.new");
my @nodes= $root->findnodes('//article');
my $size= scalar(@nodes);
my $count = 0;
my $preval=-1;

warn "No article nodes found in $file.new\n" unless $size;

foreach my $node  (@nodes)
{
	$count++;

	# findvalue returns the text of ./pmcid as a plain string (find() would
	# return a NodeList object that only stringifies through overloading).
	my $pmcid = $node->findvalue('./pmcid/text()');

	if (length $pmcid)
	{
		my $document = XML::LibXML::Document->new( '1.0', 'UTF-8' );
		my $oaiNode = XML::LibXML::Element->new( 'record' );
		my $oaiHeader = XML::LibXML::Element->new( 'header' );
		$oaiHeader->appendTextChild( 'identifier' , $pmcid);
		my $pp='./SPLITTED/'.$outdir.'/'.$pmcid.".xml";
		my $oaiMetadata = XML::LibXML::Element->new( 'metadata' );
		# This moves the article node out of the parsed tree into the new record.
		$oaiMetadata->appendChild($node);
		$oaiNode->appendChild($oaiHeader);
		$oaiNode->appendChild($oaiMetadata);

		$document->setDocumentElement($oaiNode);
		$document->toFile($pp,1) or die "Can't write $pp: $!";
	}
	else
	{
		# Without a pmcid the output path would be ".../.xml" and successive
		# articles would silently overwrite each other, so skip and report.
		warn "Skipping article number $count: no pmcid found\n";
	}

	# Print the progress only when the integer percentage changes.
	my $tmp= floor(($count / $size)*100);
	if ($preval != $tmp)
	{
	    print "\n";
	    print "Progress : $tmp\% \n";
	    $preval=$tmp;
	}
}
	
	
