1

I have an XML file that has paragraph elements, sentence elements, and annotation sub-elements under each sentence. I would like to read these annotation elements and extract their content in order to write it to a new XML file like:

    <sentence>
      <Date></Date>
      <Person></Person>
      <NumberDate></NumberDate>
      <Location></Location>
      <etc></etc>
    </sentence>

In my code, I parse the XML file and read the annotations, but I am only able to print them to the console. I can't figure out how to continue and how to export them to a new XML file.

Here is my code:

     package domparserxml;
        import java.io.File;
        //package domparserxml;
        import java.io.IOException;
        import java.io.PrintStream;
        import javax.xml.parsers.DocumentBuilder;
        import javax.xml.parsers.DocumentBuilderFactory;
        import javax.xml.parsers.ParserConfigurationException;

        import org.w3c.dom.Document;
        import org.w3c.dom.Element;
        import org.w3c.dom.Node;
        import org.w3c.dom.NodeList;
        import org.xml.sax.SAXException;

        /**
         * Reads an XML file made of {@code <paragraph>} elements and prints the annotations of
         * every sentence to the console.
         *
         * <p>Every element child of a paragraph is treated as a sentence, and every element child
         * of a sentence is treated as an annotation. Each sentence is printed on its own line as
         * {@code paragraph-id:..;paragraph-date:..;senteid:..;Tag:text;Tag:text;...}.
         */
        public class DomParserXml {

            /** Input file used when no path is passed on the command line. */
            private static final String DEFAULT_INPUT = "Chrono.xml";

            /**
             * Parses the input file and prints one line of annotations per sentence.
             *
             * @param args optional; {@code args[0]} is the path of the XML file to read. When it is
             *             absent, {@code Chrono.xml} in the working directory is read.
             */
            public static void main(String[] args) {
                String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
                DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

                try {
                    DocumentBuilder builder = factory.newDocumentBuilder();
                    Document doc = builder.parse(new File(inputPath));

                    // getElementsByTagName only ever returns elements, so the cast below is safe.
                    NodeList paragraphList = doc.getElementsByTagName("paragraph");
                    for (int i = 0; i < paragraphList.getLength(); i++) {
                        Element paragraph = (Element) paragraphList.item(i);

                        // Child nodes also contain whitespace text nodes; only elements are sentences.
                        NodeList sentenceList = paragraph.getChildNodes();
                        for (int j = 0; j < sentenceList.getLength(); j++) {
                            Node s = sentenceList.item(j);
                            if (s.getNodeType() == Node.ELEMENT_NODE) {
                                printSentence(paragraph, (Element) s);
                            }
                        }
                    }
                } catch (ParserConfigurationException e) {
                    e.printStackTrace();
                } catch (SAXException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            /**
             * Prints a line break followed by all annotations of one sentence.
             *
             * <p>The paragraph id, paragraph date and sentence id are written once, right before the
             * first annotation element. "First" is decided by a flag rather than by the child index,
             * so the output does not depend on whether the file has whitespace between the tags.
             *
             * @param paragraph the paragraph that contains the sentence
             * @param sentence  the sentence whose annotation children are printed
             */
            private static void printSentence(Element paragraph, Element sentence) {
                NodeList annotationList = sentence.getChildNodes();
                System.out.println(""); // separates sentences in the console output

                boolean headerPrinted = false;
                for (int k = 0; k < annotationList.getLength(); k++) {
                    Node anno = annotationList.item(k);
                    if (anno.getNodeType() != Node.ELEMENT_NODE) {
                        continue; // skip whitespace/text/comment nodes between annotations
                    }
                    Element annotation = (Element) anno;
                    if (!headerPrinted) {
                        System.out.print("paragraph-id:" + paragraph.getAttribute("id") + ";"
                                + "paragraph-date:" + paragraph.getAttribute("date") + ";"
                                + "senteid:" + sentence.getAttribute("id") + ";");
                        headerPrinted = true;
                    }
                    System.out.print(annotation.getTagName() + ":" + annotation.getTextContent() + ";");
                }
            }
        }

Can somebody please help me.

Thanks.

0

1 Answer 1

2

DOM provides many handy classes for creating an XML file easily. First, create a Document with the DocumentBuilder class and define all the XML content (nodes and attributes) with the Element class. Finally, use the Transformer class to write the entire XML content to an output stream, typically a File.

Have a look at the code below; you can use it right after you have obtained all the values in your paragraph variable.

package com.sujit;

import java.io.File;
import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Copies the annotations of every sentence of an input XML file into a new XML file.
 *
 * <p>The input is expected to contain {@code <paragraph>} elements whose element children are
 * sentences, and whose sentences have annotation elements (for example {@code <Date>},
 * {@code <Person>}, {@code <Location>}) as element children. The output has the form:
 *
 * <pre>
 * &lt;sentences&gt;
 *   &lt;sentence&gt;
 *     &lt;Date&gt;...&lt;/Date&gt;
 *     &lt;Person&gt;...&lt;/Person&gt;
 *   &lt;/sentence&gt;
 *   ...
 * &lt;/sentences&gt;
 * </pre>
 *
 * <p>A wrapping {@code <sentences>} root is used because an XML document can have only one
 * root element, while the input may hold many sentences.
 */
public class CreateXML {

    /** Input file read when no path is passed on the command line. */
    private static final String DEFAULT_INPUT = "Chrono.xml";

    /** Output file written when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT = "E://file.xml";

    /**
     * Reads the input file, builds the output document and writes it to disk.
     *
     * @param args optional; {@code args[0]} is the input XML path and {@code args[1]} the
     *             output XML path. Missing values fall back to {@code Chrono.xml} and
     *             {@code E://file.xml}.
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        try {
            DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
            Document input = docBuilder.parse(new File(inputPath));

            // Output document with a single root that holds one <sentence> per input sentence.
            Document output = docBuilder.newDocument();
            Element rootElement = output.createElement("sentences");
            output.appendChild(rootElement);

            NodeList paragraphs = input.getElementsByTagName("paragraph");
            for (int i = 0; i < paragraphs.getLength(); i++) {
                NodeList sentences = paragraphs.item(i).getChildNodes();
                for (int j = 0; j < sentences.getLength(); j++) {
                    Node sentence = sentences.item(j);
                    // Skip whitespace text nodes between the sentence elements.
                    if (sentence.getNodeType() == Node.ELEMENT_NODE) {
                        rootElement.appendChild(copySentence(output, (Element) sentence));
                    }
                }
            }

            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.transform(new DOMSource(output), new StreamResult(new File(outputPath)));

            System.out.println("File saved!");
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (TransformerException e) {
            // Also covers TransformerConfigurationException, which is a subclass.
            e.printStackTrace();
        }
    }

    /**
     * Builds a {@code <sentence>} element holding one child per annotation of the given sentence.
     *
     * <p>Each child keeps the annotation's tag name and carries its text content.
     *
     * @param output   the document that will own the new elements
     * @param sentence the input sentence whose element children are copied
     * @return the new, not yet attached {@code <sentence>} element
     */
    private static Element copySentence(Document output, Element sentence) {
        Element copy = output.createElement("sentence");
        NodeList annotations = sentence.getChildNodes();
        for (int k = 0; k < annotations.getLength(); k++) {
            Node annotation = annotations.item(k);
            if (annotation.getNodeType() == Node.ELEMENT_NODE) {
                Element child = output.createElement(((Element) annotation).getTagName());
                child.appendChild(output.createTextNode(annotation.getTextContent()));
                copy.appendChild(child);
            }
        }
        return copy;
    }
}

Let me know if you still face any issue.

Sign up to request clarification or add additional context in comments.

7 Comments

@sForSujit Yeah... thanks a lot, this should give me what I want, but it's only saving one sentence. I wrote an if block before your code to iterate through the paragraph items, but it's not making any difference.
are you storing all sentences in collection?
yes, I have a big xml with multiple sentences...so for each sentence I want to get out the childnodes and their content (Date, person, location, etc) and have it as the format above..
You can create a List<sentenceDetails>, where sentenceDetails is a class with setters and getters; once you have saved all the sentences, you can iterate over the list and run the XML creation code for each one.
Not all. I just messed up my code. When I tried to create the List<sentenceDetails>, it asks me to create a class, and I can't figure out where to do that...
|

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.