I'm trying to extract elements from XML records where each xml file has many XML records. Below is the modified code and sample xmls that I'm using.
I'm expecting an array of Strings where each element of the array is "user:id" but the result is ":". I was expecting XML.loadString to parse each file and the result would be separate XML records. Meaning if I take the two sample files as example I would end up with 4 XML records. As it is, it's two.
After adding a println(d) after getting next what I get is the entire string that represents the file which is likely why the getId and getUser functions are not returning anything.
Am I handling the load incorrectly?
import org.apache.spark.{SparkConf, SparkContext}
import scala.xml._
import scala.collection.mutable.ArrayBuffer
object Details {
def getDetails(xmlstring: String): Iterator[Node] = {
val nodes = XML.loadString(xmlstring)
nodes.toIterator
}
def getId(detail: Node): String = {
(detail \ "id").text
}
def getUser(detail: Node): String = {
(detail \ "user").text
}
def getDetailList(details: Iterator[Node]): Array[String] = {
var list = ArrayBuffer[String]()
while (details.hasNext) {
val d = details.next
val user = getUser(d)
val id = getId(d)
val formattedText = user + ":" + id
list += formattedText
}
list.toArray
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Details")
val sc: SparkContext = new SparkContext(conf)
val lines = sc.wholeTextFiles("file:///path/to/files/")
val xmlStrings = lines.map(line => line._2)
val detailsRecords = xmlStrings.map(getDetails)
val detailsList = detailsRecords.map(getDetailList)
spark.stop()
}
}
And two sample files...
test.xml
<details>
<detail>
<user>Dan</user>
<id>5555</id>
</detail>
<detail>
<user>Mike</user>
<id>6666</id>
</detail>
</details>
test2.xml
<details>
<detail>
<user>John</user>
<id>1234</id>
</detail>
<detail>
<user>Joe</user>
<id>5678</id>
</detail>
</details>