|
[HepData-svn] r1605 - trunk/hepdata-webapp/src/main/java/cedar/hepdata/formats | From: blackhole at projects.hepforge.org | Date: Fri Nov 30 09:11:11 GMT 2012
/*
 * MarcXMLFormatter.java
 *
 * Added in HepData SVN revision r1605 as
 * trunk/hepdata-webapp/src/main/java/cedar/hepdata/formats/MarcXMLFormatter.java
 * Author: whalley
 * Date: Fri Nov 30 09:11:11 2012
 * Log: adding new MarcXML formatter for Inspire
 */
package cedar.hepdata.formats;

import cedar.hepdata.model.*;
import cedar.hepdata.util.*;
import cedar.hepdata.xml.*;
import cedar.hepdata.db.*;
import cedar.hepdata.webapp.components.*;


import java.util.*;
import java.text.*;

import org.antlr.stringtemplate.*;

import com.Ostermiller.util.SignificantFigures;


/**
 * Renders a {@link Paper} and its datasets as a MARC XML {@code <collection>}.
 *
 * <p>For every dataset one {@code <record>} of metadata is written (see
 * {@link #_metadataXML(Dataset)}), followed by a single {@code <data>} element holding the
 * numerical rows of every dataset (see {@link #_numbersXML(Dataset)}).
 *
 * <p>NOTE(review): only {@code <} and {@code >} are escaped, and only in the places where the
 * original code attempted it. Titles, comments and headers are written verbatim, so an
 * {@code &} or angle bracket in those fields yields malformed XML - confirm whether upstream
 * data is already entity-encoded before adding general escaping.
 */
public class MarcXMLFormatter {

    /**
     * Formats a whole paper.
     *
     * @param p the paper to render; may be {@code null}
     * @return the XML document as a string, or {@code null} when {@code p} is {@code null}
     */
    public static String format(Paper p) {
        if (p == null) {
            return null;
        }
        StringBuilder s = new StringBuilder();

        s.append(_headerXML());
        for (Dataset ds : p.getDatasets()) {
            s.append(_metadataXML(ds));
        }

        s.append(" <data>\n");
        for (Dataset ds : p.getDatasets()) {
            s.append(" <dataset id=\"" + ds.getId() + "\">\n");
            s.append(_numbersXML(ds));
            s.append(" </dataset>\n");
        }
        s.append(" </data>\n");

        s.append(_footerXML());
        return s.toString();
    }

    /**
     * Returns the current time formatted as {@code yyyyMMddHHmmss}.
     *
     * @return the timestamp string
     */
    public static String _getTimestamp() {
        // A fresh formatter per call: SimpleDateFormat must not be shared between threads.
        DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
        return dateFormat.format(new Date());
    }

    /**
     * Returns the XML declaration and the opening {@code <collection>} tag.
     *
     * @return the document header
     */
    public static String _headerXML() {
        return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
            + "<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n";
    }

    /**
     * Returns the closing {@code </collection>} tag.
     *
     * @return the document footer
     */
    public static String _footerXML() {
        return "</collection>\n";
    }

    /**
     * Builds the {@code <record>} element describing one dataset.
     *
     * <p>Fields written, in order: control fields 001 (9000000 + dataset id) and 005 (paper
     * update date), 245 (title built from the "Location:" comment and the paper title),
     * 336, 520 (remaining dataset comments), 653 (keywords gathered from y-axis comments and
     * properties), 710, 786, 856, 910 (one per axis header) and 911 (axis counts).
     *
     * @param ds the dataset to describe
     * @return the record XML
     */
    public static String _metadataXML(Dataset ds) {
        StringBuilder s = new StringBuilder();
        int idbase = 9000000;
        int id = idbase + ds.getId();
        String location = "";
        Paper p = ds.getPaper();

        s.append("<record>\n");
        s.append(" <controlfield tag=\"001\">" + id + "</controlfield>\n");
        s.append(" <controlfield tag=\"005\">" + ds.getPaper().getDateUpdated() + "</controlfield>\n");

        s.append(" <datafield tag=\"245\" ind1=\"\" ind2=\"\">\n");
        s.append(" <subfield code=\"9\">HEPDATA</subfield>\n");
        s.append(" <subfield code=\"a\">");
        for (String ct : ds.getComments()) {
            if (ct.startsWith("Location:")) {
                location = ct.replaceFirst("Location:", "");
                s.append(ct.replaceFirst("Location:", "Data from") + " from: " + p.getTitle());
            }
        }
        s.append(" </subfield>\n");
        s.append(" </datafield>\n");

        s.append(" <datafield tag=\"336\" ind1=\"\" ind2=\"\">\n");
        s.append(" <subfield code=\"t\">DATASET</subfield>\n");
        s.append(" </datafield>\n");

        s.append(" <datafield tag=\"520\" ind1=\"\" ind2=\"\">\n");
        s.append(" <subfield code=\"9\">HEPDATA</subfield>\n");
        s.append(" <subfield code=\"h\">");
        for (String ct : ds.getComments()) {
            if (!ct.startsWith("Location:")) {
                s.append(ct.replaceFirst("VERBATIM", ""));
            }
        }
        s.append(" </subfield>\n");
        s.append(" </datafield>\n");

        int nxmax = ds.getXAxes().size();
        int nymax = ds.getYAxes().size();

        // Widest possible keyword row: the largest comment count plus the largest property
        // count found on any single y-axis.
        int ncomm = 0;
        int nprop = 0;
        for (YAxis yax : ds.getYAxes()) {
            if (yax.getComments().size() > ncomm) {
                ncomm = yax.getComments().size();
            }
            if (yax.getProperties().size() > nprop) {
                nprop = yax.getProperties().size();
            }
        }
        // The width is fixed here and never shrunk afterwards: unused slots stay null and are
        // skipped below. (Shrinking a shared bound per duplicate dropped trailing keywords
        // whenever more than one y-axis contained duplicates.)
        int width = nprop + ncomm;
        String[][] store = new String[nymax][width];

        int numy = -1;
        for (YAxis yax : ds.getYAxes()) {
            numy++;
            List<String> entries = _collectKeywords(yax);
            int filled = 0;
            for (int k = 0; k < entries.size(); k++) {
                // The list is sorted, so duplicates are adjacent.
                if (k > 0 && entries.get(k).equals(entries.get(k - 1))) {
                    continue;
                }
                store[numy][filled] =
                    entries.get(k).replaceFirst("sqrts", "SQRT(S)").replaceFirst("zzzz :", "");
                filled++;
            }
        }

        for (int i = 0; i < width; i++) {
            if (store[0][i] == null) {
                continue;
            }
            // An empty keyword entry listing the x columns precedes every keyword.
            s.append(" <datafield tag=\"653\" ind1=\"\" ind2=\"\">\n");
            s.append(" <subfield code=\"k\"/>\n");
            s.append(" <subfield code=\"v\"/>\n");
            for (int nx = 0; nx <= nxmax - 1; nx++) {
                s.append(" <subfield code=\"c\">" + nx + "</subfield>\n");
            }
            s.append(" </datafield>\n");

            boolean same = true;
            for (int ny = 0; ny < nymax - 1; ny++) {
                if (store[ny][i] == null || store[ny + 1][i] == null
                        || !store[ny][i].equals(store[ny + 1][i])) {
                    same = false;
                }
            }

            if (same) {
                // One entry shared by all y columns.
                String entry = store[0][i];
                int sep = entry.indexOf(":");
                s.append(" <datafield tag=\"653\" ind1=\"\" ind2=\"\">\n");
                if (sep > 0 && entry.substring(0, sep - 1).equals("RE")) {
                    s.append(" <subfield code=\"r\">");
                } else {
                    s.append(" <subfield code=\"k\">");
                    s.append(_keywordPart(entry));
                    s.append("</subfield>\n");
                    s.append(" <subfield code=\"v\">");
                }
                s.append(_escapeAngleBrackets(_valuePart(entry)));
                s.append("</subfield>\n");
                for (int nycol = nxmax; nycol <= nxmax + nymax - 1; nycol++) {
                    s.append(" <subfield code=\"c\">" + nycol + "</subfield>\n");
                }
                s.append(" </datafield>\n");
            } else {
                // One entry per y column; the keyword name is only written for the first.
                for (int ny = 0; ny < nymax; ny++) {
                    String entry = store[ny][i];
                    if (entry == null) {
                        // This axis has fewer keywords than the first one.
                        continue;
                    }
                    s.append(" <datafield tag=\"653\" ind1=\"\" ind2=\"\">\n");
                    if (entry.startsWith("RE :")) {
                        s.append(" <subfield code=\"r\">");
                    } else {
                        s.append(" <subfield code=\"k\">");
                        if (ny == 0) {
                            s.append(_keywordPart(entry));
                        }
                        s.append("</subfield>\n");
                        s.append(" <subfield code=\"v\">");
                    }
                    // Split on this entry's own separator, not the first axis's.
                    s.append(_escapeAngleBrackets(_valuePart(entry)));
                    s.append("</subfield>\n");
                    // NOTE(review): the shared-entry branch and tag 910 number y columns from
                    // nxmax (0-based); ny+1 only agrees with that when there is exactly one
                    // x-axis - confirm the intended column index.
                    int nycol = ny + 1;
                    s.append(" <subfield code=\"c\">" + nycol + "</subfield>\n");
                    s.append(" </datafield>\n");
                }
            }
        }

        s.append(" <datafield tag=\"710\" ind1=\"\" ind2=\"\">\n");
        s.append(" <subfield code=\"g\">" + p.getInformalName() + " Collaboration</subfield>\n");
        s.append(" </datafield>\n");

        s.append(" <datafield tag=\"786\" ind1=\"\" ind2=\"\">\n");
        s.append(" <subfield code=\"w\">" + p.getInspireId() + "</subfield>\n");
        s.append(" <subfield code=\"q\">" + "1" + "</subfield>\n");
        s.append(" <subfield code=\"r\">" + p.getArchive().replaceAll("ARXIV", "arXiv") + "</subfield>\n");
        s.append(" <subfield code=\"h\">" + location + "</subfield>\n");
        s.append(" </datafield>\n");

        s.append(" <datafield tag=\"856\" ind1=\"4\" ind2=\"\">\n");
        s.append(" <subfield code=\"u\">\n");
        s.append(" http://inspirehep.net/record/" + id + "/files/Data.txt\n");
        s.append(" </subfield>\n");
        s.append(" <subfield code=\"y\">" + "data extracted from the table" + "</subfield>\n");
        s.append(" </datafield>\n");

        // Column headers: x-axes first, then y-axes, numbered consecutively from 0.
        int n = -1;
        for (XAxis xax : ds.getXAxes()) {
            n++;
            s.append(" <datafield tag=\"910\" ind1=\"\" ind2=\"\">\n");
            s.append(" <subfield code=\"n\">" + n + "</subfield>\n");
            s.append(" <subfield code=\"d\">" + xax.getHeader() + "</subfield>\n");
            s.append(" </datafield>\n");
        }
        for (YAxis yax : ds.getYAxes()) {
            n++;
            s.append(" <datafield tag=\"910\" ind1=\"\" ind2=\"\">\n");
            s.append(" <subfield code=\"n\">" + n + "</subfield>\n");
            s.append(" <subfield code=\"d\">" + yax.getHeader() + "</subfield>\n");
            s.append(" </datafield>\n");
        }
        s.append(" <datafield tag=\"911\" ind1=\"\" ind2=\"\">\n");
        s.append(" <subfield code=\"x\">" + nxmax + "</subfield>\n");
        s.append(" <subfield code=\"y\">" + nymax + "</subfield>\n");
        s.append(" </datafield>\n");

        s.append(" </record>\n");
        return s.toString();
    }

    /**
     * Builds the row/column XML holding the numbers of one dataset, preceded by one
     * {@code <error>} element per dataset-level error.
     *
     * @param ds the dataset to render
     * @return the XML fragment (dataset errors only when the dataset has no points)
     */
    public static String _numbersXML(Dataset ds) {
        StringBuilder s = new StringBuilder();

        // first deal with any dataset errors
        for (DatasetError de : ds.getErrors()) {
            s.append(" <error>");
            s.append("Additional systematic error: ");
            // Print the magnitude when either side is non-zero.
            if (de.getPlus() != 0.0 || de.getMinus() != 0.0) {
                if (de.getPlus().equals(de.getMinus())) {
                    s.append("+-");
                    s.append(" " + de.getPlus());
                } else {
                    s.append("+" + de.getPlus() + ",-" + de.getMinus() + " ");
                }
                s.append(de.getNormType().toSymbol() + " ");
            }
            s.append("(" + de.getComment() + ")");
            s.append("</error>\n");
        }

        if (ds.getNumPoints() != 0) {
            int npoints = ds.getNumPoints();
            int nx = ds.getXAxes().size(); // number of xaxes in the dataset
            int ny = ds.getYAxes().size(); // number of yaxes in the dataset
            // A dataset comment starting with VERBATIM switches off number re-formatting
            // for y values and their errors.
            boolean verbatim = false;
            for (String comment : ds.getComments()) {
                if (comment.startsWith("VERBATIM")) {
                    verbatim = true;
                }
            }
            // Points and axes are addressed 1-based.
            for (int ip = 1; ip <= npoints; ip++) {
                s.append(" <row id=\"" + ip + "\">\n");
                for (int ix = 1; ix <= nx; ix++) {
                    _appendXColumn(s, ds.getXAxis(ix), ip);
                }
                for (int iy = 1; iy <= ny; ++iy) {
                    _appendYColumn(s, ds.getYAxis(iy), ip, verbatim);
                }
                s.append(" </row>\n");
            }
        }
        return s.toString();
    }

    /** Replaces {@code >} and {@code <} by their XML entities. */
    private static String _escapeAngleBrackets(String text) {
        return text.replace(">", "&gt;").replace("<", "&lt;");
    }

    /** Returns the text before the first ':' of a keyword entry, or "" if there is none. */
    private static String _keywordPart(String entry) {
        int sep = entry.indexOf(":");
        return sep > 0 ? entry.substring(0, sep) : "";
    }

    /** Returns the text after the first ':' of a keyword entry, or the whole entry if none. */
    private static String _valuePart(String entry) {
        int sep = entry.indexOf(":");
        return sep >= 0 ? entry.substring(sep + 1) : entry;
    }

    /** Gathers the comment and property strings of one y-axis as a sorted list. */
    private static List<String> _collectKeywords(YAxis yax) {
        List<String> entries = new ArrayList<String>();
        for (String comment : yax.getComments()) {
            if (!comment.startsWith("RE") && comment.indexOf(" IN ") > 0 && comment.indexOf(" : ") > 0) {
                comment = _moveUnitToEnd(comment);
            }
            if (comment.startsWith(". :")) {
                // Temporary prefix so these entries sort last; it is stripped again by
                // _metadataXML when the entry is stored.
                comment = comment.replace(". :", "zzzz :");
            }
            if (!comment.startsWith("Axis error")) {
                entries.add(comment);
            }
        }
        for (Property property : yax.getProperties()) {
            String text = _describeProperty(property);
            if (text.startsWith(". :")) {
                text = text.substring(3);
            }
            entries.add(text);
        }
        Collections.sort(entries);
        return entries;
    }

    /**
     * Rearranges a comment containing both " IN " and " : " so that the text between the two
     * markers (with "EV" rewritten to "eV") is moved to the end of the string.
     */
    private static String _moveUnitToEnd(String comment) {
        int len = comment.length();
        int ip1 = comment.indexOf(" IN ");
        int ip2 = comment.indexOf(" : ");
        if (ip2 > ip1) {
            return comment.substring(0, ip1)
                + comment.substring(ip2, len)
                + comment.substring(ip1 + 3, ip2).replace("EV", "eV");
        }
        return comment.substring(0, ip2)
            + comment.substring(ip1, len)
            + comment.substring(ip2 + 3, ip1).replace("EV", "eV");
    }

    /**
     * Renders a property as {@code "name : [focus][low | (low-high)] [unit]"}.
     * The low/high part is omitted when either bound is missing.
     */
    private static String _describeProperty(Property property) {
        StringBuilder text = new StringBuilder(property.getName().toString());
        text.append(" : ");
        boolean hasFocus = property.getFocus() != null;
        boolean hasBounds = property.getLowValue() != null && property.getHighValue() != null;
        if (hasFocus) {
            text.append(property.getFocus().toString());
        }
        if (hasBounds && property.getLowValue().equals(property.getHighValue())) {
            text.append(property.getLowValue().toString());
        } else if (hasBounds) {
            if (hasFocus) {
                text.append(" (");
            }
            text.append(property.getLowValue().toString());
            text.append("-");
            text.append(property.getHighValue().toString());
            if (hasFocus) {
                text.append(")");
            }
        }
        // NOTE(review): as written the unit is also appended when its string form is empty;
        // possibly "!equals("") && ..." was intended - confirm before changing.
        if (property.getUnit().toString().equals("") || !property.getUnit().isDimensionless()) {
            text.append(" ");
            text.append(property.getUnit().toString());
        }
        return text.toString();
    }

    /**
     * Returns the least-significant-digit position to use for {@code value}: the one reported
     * by {@link SignificantFigures}, unless the decimal string contains a run of seven 9s or
     * seven 0s (after the first character), in which case the position of that run relative
     * to the decimal point is used instead.
     */
    private static int _defaultLsd(Double value) {
        int lsd = new SignificantFigures(value).getLSD();
        String text = value.toString();
        int dot = text.indexOf(".");
        int nines = text.indexOf("9999999");
        if (nines > 0) {
            return dot - nines + 1;
        }
        int zeros = text.indexOf("0000000");
        if (zeros > 0) {
            return dot - zeros + 1;
        }
        return lsd;
    }

    /** Returns the most-significant-digit position of {@code value}, with 0 replaced by 1. */
    private static int _defaultMsd(Double value) {
        int msd = new SignificantFigures(value).getMSD();
        return msd == 0 ? 1 : msd;
    }

    /** Appends the {@code <column type="x">} element for point {@code ip} of axis {@code x}. */
    private static void _appendXColumn(StringBuilder s, XAxis x, int ip) {
        s.append(" <column type=\"x\" colspan=\"1\">");
        // Work out x-axis formatting
        /// @todo Do this formatting stuff properly with SignificantFigures as for y-values
        int lwidth = 0;
        int hwidth = 0;
        int fwidth = 0;
        // asymmfocus: at least one bin has a focus differing from the bin centre.
        // haswidth: at least one bin has a non-negligible width.
        boolean asymmfocus = false;
        boolean haswidth = false;
        for (Bin bin : x.getBins()) {
            if (bin.getLowValue() != null && bin.getHighValue() != null) {
                if (bin.getFocus() != null) {
                    double offset = bin.getFocus() - (bin.getLowValue() + bin.getHighValue()) / 2.0;
                    if (Math.abs(offset / bin.getFocus()) > 1E-6) {
                        asymmfocus = true;
                    }
                }
                double span = bin.getHighValue() - bin.getLowValue();
                double mean = (bin.getHighValue() + bin.getLowValue()) / 2.0;
                if (Math.abs(span / mean) > 1E-6) {
                    haswidth = true;
                }
            }
            if (bin.getLowValue() != null && bin.getLowValue().toString().length() > lwidth) {
                lwidth = bin.getLowValue().toString().length();
            }
            if (bin.getHighValue() != null && bin.getHighValue().toString().length() > hwidth) {
                hwidth = bin.getHighValue().toString().length();
            }
            if (bin.getFocus() != null && bin.getFocus().toString().length() > fwidth) {
                fwidth = bin.getFocus().toString().length();
            }
        }

        Bin b = x.getBin(ip);
        if (b != null) {
            if (b.getDescription() != null) {
                s.append(b.getDescription());
            } else if (b.getRelation() != Relation.EQUALS) {
                s.append(_escapeAngleBrackets(b.getRelation().toString()));
                if (b.getLowValue() != null) {
                    s.append(b.getLowValue().toString());
                } else if (b.getHighValue() != null) {
                    s.append(b.getHighValue().toString());
                }
            } else {
                Double low = b.getLowValue();
                Double high = b.getHighValue();
                Double f = b.getFocus();

                int lsd = 0;
                int msd = 0;
                if (f != null) {
                    lsd = _defaultLsd(f);
                    msd = _defaultMsd(f);
                    if (b.getFocusLength() != null) {
                        lsd = -b.getFocusLength();
                    }
                }
                String focusText = Formats.forms(f, fwidth, msd, -lsd);
                if (lsd == -1 && focusText.endsWith(".0") && b.getFocusLength() == null) {
                    focusText = focusText.substring(0, focusText.length() - 2);
                }

                if (low != null && high != null) {
                    lsd = _defaultLsd(low);
                    if (b.getLowValueLength() != null) {
                        lsd = -b.getLowValueLength();
                    }
                    String lowText = Formats.forms(low, lwidth, _defaultMsd(low), -lsd);

                    lsd = _defaultLsd(high);
                    if (b.getHighValueLength() != null) {
                        lsd = -b.getHighValueLength();
                    }
                    String highText = Formats.forms(high, hwidth, _defaultMsd(high), -lsd);

                    double width = high - low;
                    if (f != null && asymmfocus && haswidth) {
                        s.append(focusText);
                        s.append(" (bin: ");
                        s.append(lowText + " - " + highText);
                        s.append(")");
                    } else if (f != null && ((f == 0.0 && !haswidth) || Math.abs(width / f) < 1E-6)) {
                        // Null-safe: a bin with bounds but no focus falls through to the range.
                        s.append(focusText);
                    } else {
                        s.append(lowText + " - " + highText);
                    }
                } else {
                    s.append(focusText);
                }
            }
        }
        s.append("</column>\n");
    }

    /**
     * Appends the {@code <column type="y">} element for point {@code ip} of axis {@code y}:
     * the value followed by its errors and error comments. Any exception raised while
     * rendering the point is replaced by a "-" placeholder appended after whatever was
     * already written.
     */
    private static void _appendYColumn(StringBuilder s, YAxis y, int ip, boolean verbatim) {
        s.append(" <column type=\"y\" colspan=\"1\">");
        Point pt = y.getPoint(ip);
        try {
            boolean exact = pt.getRelation() == Relation.EQUALS;
            if (!exact) {
                s.append(_escapeAngleBrackets(pt.getRelation().toString()));
            }
            SignificantFigures val = new SignificantFigures(pt.getValue());
            int lsd = val.getLSD();
            String valueText = pt.getValue().toString();
            if (valueText.endsWith("0") || valueText.indexOf("0E") > 0) {
                lsd += 1;
            }

            if (pt.getValueLength() != null && pt.getValueLength() > 0) {
                // An explicit stored length overrides the derived precision.
                lsd = -pt.getValueLength();
            } else if (exact) {
                // Otherwise let the finest non-percentage error decide the precision.
                for (PointError e : pt.getErrors()) {
                    if (e.getNormType() != ErrorNorm.PCT) {
                        SignificantFigures plusSig = new SignificantFigures(e.getPlus());
                        if (!e.getPlus().toString().endsWith("0") && e.getPlus().toString().indexOf("0E") <= 0) {
                            lsd = Math.min(lsd, plusSig.getLSD());
                        }
                        SignificantFigures minusSig = new SignificantFigures(e.getMinus());
                        if (!e.getMinus().toString().endsWith("0") && e.getMinus().toString().indexOf("0E") <= 0) {
                            lsd = Math.min(lsd, minusSig.getLSD());
                        }
                    }
                }
            }

            int msd = val.getMSD();
            if (msd <= 0) {
                msd = 1;
            }
            if (verbatim) {
                s.append(pt.getValue().toString().replaceAll("E", "e"));
            } else {
                s.append(Formats.forms(pt.getValue(), msd, -lsd));
            }

            if (exact) {
                // lsd deliberately carries over from one error to the next.
                for (PointError e : pt.getErrors()) {
                    String stype = e.getSourceType().toShortString();
                    SignificantFigures vale = null;
                    if (e.getPlusLength() != null) {
                        lsd = -e.getPlusLength();
                    }
                    if (e.isSymmetric()) {
                        vale = new SignificantFigures(e.getPlus());
                        s.append(" +- "); // chosen to allow breaks *before* the +- sign only
                        if (e.getNormType() == ErrorNorm.PCT) {
                            lsd = vale.getLSD();
                        }
                        msd = vale.getMSD();
                        if (msd <= 0) {
                            msd = 1;
                        }
                        if (verbatim) {
                            s.append(e.getPlus().toString().replaceAll("E", "e"));
                        } else {
                            s.append(Formats.forms(e.getPlus(), msd, -lsd));
                        }
                        s.append(e.getNormType().toSymbol());
                    } else if (e.isSymmetricReversed()) {
                        vale = new SignificantFigures(e.getPlus());
                        s.append(" -+ "); // chosen to allow breaks *before* the -+ sign only
                        if (e.getNormType() == ErrorNorm.PCT) {
                            lsd = vale.getLSD();
                        }
                        msd = vale.getMSD();
                        if (msd <= 0) {
                            msd = 1;
                        }
                        if (verbatim) {
                            s.append(e.getMinus().toString().replaceAll("E", "e"));
                        } else {
                            s.append(Formats.forms(e.getMinus(), msd, -lsd));
                        }
                        s.append(e.getNormType().toSymbol());
                    } else {
                        vale = new SignificantFigures(e.getPlus());
                        // A negative "plus" error carries its own sign.
                        s.append(e.getPlus() < 0.0 ? "" : " +");
                        if (e.getNormType() == ErrorNorm.PCT) {
                            lsd = vale.getLSD();
                        }
                        msd = vale.getMSD();
                        if (msd <= 0) {
                            msd = 1;
                        }
                        if (verbatim) {
                            s.append(e.getPlus().toString().replaceAll("E", "e"));
                        } else {
                            s.append(Formats.forms(e.getPlus(), msd, -lsd));
                        }
                        s.append(e.getNormType().toSymbol());

                        vale = new SignificantFigures(e.getMinus());
                        s.append(e.getPlus() < 0.0 ? ",+" : ",-");
                        if (e.getNormType() == ErrorNorm.PCT) {
                            lsd = vale.getLSD();
                        }
                        msd = vale.getMSD();
                        if (msd <= 0) {
                            msd = 1;
                        }
                        if (verbatim) {
                            s.append(e.getMinus().toString().replaceAll("E", "e"));
                        } else {
                            s.append(Formats.forms(e.getMinus(), msd, -lsd));
                        }
                        s.append(e.getNormType().toSymbol());
                    }
                    if (!stype.equals("?")) {
                        s.append(" (" + stype + ") ");
                    }
                }
            }
            for (PointError e : pt.getErrors()) {
                if (!e.getComment().equals("")) {
                    s.append(" (" + e.getComment() + ")");
                }
            }
        } catch (Exception ex) {
            // Best effort: a missing or unformattable point is shown as "-" rather than
            // aborting the whole export.
            s.append("-");
        }
        s.append("</column>\n");
    }

}
More information about the HepData-svn mailing list |