I'm trying to use sax-js to process an XML file in chunks: parse items into an array of limited size, await a DB update, then reset the array.
The problem is that it doesn't work asynchronously out of the box, and I haven't yet found a way to update the DB without losing data.
I'm using the latest version of sax-js.
const zlib = require("zlib");
const fs = require("fs");
const sax = require("sax");

const gunzip = zlib.createGunzip();
const xmlStream = fs.createReadStream(path).pipe(gunzip); // path points to the gzipped XML file
const saxStream = sax.createStream(true);

const BATCH_SIZE = 100;
let documents = [];
let currentElement = {};
let currentNode = null;
let isProcessing = false;

saxStream.on("opentag", function (node) {
  currentNode = node.name;
  if (node.name === "Item") {
    currentElement = {}; // Initialize a new empty object for each item
  }
});

saxStream.on("text", (text) => {
  if (currentElement) {
    doSomething(text); // placeholder: maps the text into currentElement
  }
});

saxStream.on("closetag", async function (name) {
  if (name === "Item" && currentElement && documents.length < BATCH_SIZE) {
    documents.push(currentElement);
    currentElement = {}; // Reset for the next item
    if (documents.length === BATCH_SIZE && !isProcessing) {
      isProcessing = true;
      console.log("1. Start process batch of size", documents.length);
      await insertDocuments(documents);
      console.log("1. End process batch of size", documents.length);
      documents = []; // Reset the documents array after processing
      isProcessing = false;
    }
  }
  currentNode = null; // Reset the current node name
});

xmlStream.pipe(saxStream);
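As far as I understand, the reason is that Node's EventEmitter invokes listeners synchronously and ignores the promise an async listener returns, so sax keeps firing events while insertDocuments is still in flight. A minimal standalone sketch of that behavior (nothing here is sax-specific):

const { EventEmitter } = require("events");

const emitter = new EventEmitter();

emitter.on("tick", async () => {
  console.log("handler start");
  await new Promise((resolve) => setTimeout(resolve, 100)); // simulate a slow DB insert
  console.log("handler end");
});

// emit() returns immediately; the promises returned by the async listener
// are ignored, so both handler runs start before either one finishes:
emitter.emit("tick");
emitter.emit("tick");
// -> handler start, handler start, handler end, handler end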
So, of course, it doesn't await insertDocuments and just keeps parsing.
I then tried pausing the stream with xmlStream.pause() and xmlStream.resume() before and after the async call:
if (documents.length === BATCH_SIZE && !isProcessing) {
  isProcessing = true;
  xmlStream.pause();
  console.log("1. Start process batch of size", documents.length);
  await insertDocuments(documents);
  console.log("1. End process batch of size", documents.length);
  documents = []; // Reset the documents array after processing
  isProcessing = false;
  xmlStream.resume();
}
That's not a good solution either: we pause the xmlStream but not the sax stream, so sax presumably keeps emitting closetag events for the chunk it has already buffered, and while documents.length equals BATCH_SIZE the documents.length < BATCH_SIZE guard silently drops those items, which is how we lose data.
I also tried this._parser.close() and this._parser.resume() inside the handler, and still couldn't find a working solution.
The log output of this code is:
- Start process batch of size 100
- processed: 100
- End process batch of size 100
- Start process batch of size 17
- processed: 17
- End process batch of size 17
- XML parsing completed.
** In the test I used a small file with 133 items in total; as the logs show, only 117 were processed, so 16 items were lost.
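For completeness, the direction I'm thinking of trying next (just a sketch, assuming insertDocuments returns a promise): never drop items, swap the batch array out before awaiting, and serialize the inserts on a promise chain. Even if sax keeps emitting closetag events for the chunk it has already buffered while xmlStream is paused, those items land in the fresh array instead of being lost:

let documents = [];
let flushing = Promise.resolve(); // serializes the DB inserts

saxStream.on("closetag", function (name) {
  if (name === "Item") {
    documents.push(currentElement); // never drop an item
    currentElement = {};
    if (documents.length >= BATCH_SIZE) {
      const batch = documents; // swap the array out before any async work
      documents = [];
      xmlStream.pause(); // limit how much new data reaches the parser
      flushing = flushing
        .then(() => insertDocuments(batch))
        .then(() => xmlStream.resume());
    }
  }
  currentNode = null;
});

// Flush the remainder (the last partial batch) once parsing finishes.
saxStream.on("end", () => {
  flushing
    .then(() => (documents.length ? insertDocuments(documents) : null))
    .then(() => console.log("XML parsing completed."));
});

Here the pause() only limits memory use; correctness would come from never dropping items rather than from stopping the parser. I haven't verified this against sax-js internals, so I'd still like to know the proper way to apply backpressure to the sax stream.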