I have an XML file that contains paragraph elements, sentence elements inside each paragraph, and annotation sub-elements inside each sentence. I would like to read these annotation elements, extract their content, and write it to a new XML file like this:
<sentence>
<Date></Date>
<Person></Person>
<NumberDate></NumberDate>
<Location></Location>
<etc></etc>
</sentence>
In my code, I parse the XML file and read the annotations, but I am only able to print them to the console. I can't figure out how to continue from there and export the result to a new XML file.
Here is my code:
package domparserxml;
import java.io.File;
//package domparserxml;
import java.io.IOException;
import java.io.PrintStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Reads {@code Chrono.xml} from the working directory and prints, for every
 * element child of every {@code paragraph} element (the sentences), one line
 * to standard output listing the element children of that sentence (the
 * annotations).
 *
 * <p>Each sentence line has the form
 * {@code paragraph-id:P;paragraph-date:D;senteid:S;Tag1:text1;Tag2:text2;...}
 * and is preceded by a line break. A sentence without any annotation element
 * produces only the line break.
 */
public class DomParserXml {

    /** Input document; a relative name is resolved against the working directory. */
    private static final String INPUT_FILE = "Chrono.xml";

    /**
     * Parses the input document and prints the annotations of every sentence.
     * Parser configuration, parsing and I/O failures are reported with a stack
     * trace on standard error; they are not propagated to the caller.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        try {
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(INPUT_FILE);
            NodeList paragraphList = doc.getElementsByTagName("paragraph");
            for (int i = 0; i < paragraphList.getLength(); i++) {
                Node p = paragraphList.item(i);
                if (p.getNodeType() == Node.ELEMENT_NODE) {
                    printParagraph((Element) p);
                }
            }
        } catch (ParserConfigurationException | SAXException | IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints one line for every element child (sentence) of the given paragraph. */
    private static void printParagraph(Element paragraph) {
        NodeList sentenceList = paragraph.getChildNodes();
        for (int j = 0; j < sentenceList.getLength(); j++) {
            Node s = sentenceList.item(j);
            // Child nodes also include whitespace text nodes and comments; skip those.
            if (s.getNodeType() == Node.ELEMENT_NODE) {
                printSentence(paragraph, (Element) s);
            }
        }
    }

    /** Prints a line break followed by the header and all annotations of one sentence. */
    private static void printSentence(Element paragraph, Element sentence) {
        NodeList annotationList = sentence.getChildNodes();
        System.out.println(""); // separates the output of consecutive sentences

        // Track the first annotation with a flag rather than by child index:
        // the index of the first element depends on whether the XML has
        // whitespace text nodes between tags, so a fixed index silently drops
        // the first annotation of compact (non-indented) documents.
        boolean firstAnnotation = true;
        for (int k = 0; k < annotationList.getLength(); k++) {
            Node anno = annotationList.item(k);
            if (anno.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element annotation = (Element) anno;
            if (firstAnnotation) {
                // The paragraph/sentence header is written once, before the first annotation.
                System.out.print("paragraph-id:" + paragraph.getAttribute("id") + ";"
                        + "paragraph-date:" + paragraph.getAttribute("date") + ";"
                        + "senteid:" + sentence.getAttribute("id") + ";");
                firstAnnotation = false;
            }
            System.out.print(annotation.getTagName() + ":" + annotation.getTextContent() + ";");
        }
    }
}
Can somebody please help me?
Thanks.