У меня есть следующая логика синтаксического анализа xml.gz, которая перебирает ~ 2000 файлов xml.gz (в виде потоков), которые, как мне кажется, содержат около 25 000 узлов или «составных частей», и я массово загружаю их в mongodb 1000 за один раз.
const zlib = require('zlib')
const fs = require('fs')
const sax = require('sax')
const MongoClient = require('mongodb').MongoClient
// Connection URL
const url = 'mongodb://localhost:27017'
// Database Name
const dbName = 'foo'
const paths = fs.readdirSync('./')
MongoClient.connect(url, function(err, client) {
console.log("Connected successfully to server")
const db = client.db(dbName)
const compounds = db.collection('compounds')
compounds.createIndex(
{ "visited": 1 },
null,
function(err, results) {
console.log(results)
perform(compounds, paths)
}
)
})
function perform(compounds, paths) {
var i = 0
var log = fs.createWriteStream('log.csv')
log.write('path\n')
next()
function next() {
var name = paths[i++]
if (!name) return console.log('done')
doit(log, compounds, name, function(){
setImmediate(next)
})
}
}
function doit(log, collection, name, done) {
console.log('parsing...', name)
log.write(name + '\n')
const gzip = zlib.createGunzip()
const xmlStream = fs.createReadStream(name)
const out = fs.createWriteStream('foo.xml')
const saxStream = sax.createStream(false, {})
saxStream.on('error', function(e){
console.error('error!', e)
this._parser.error = null
this._parser.resume()
})
var compounds = []
var compound
var bondIndex = 0
var atomIndex = 0
var field
var label
var name
var point
var type
var op
var insert = 0
function write(end) {
if (compounds.length === 1000 || end) {
console.log('writing', insert++)
var inserts = compounds
compounds = []
collection.bulkWrite(inserts.map(compound => {
compound.visited = false
return {
updateOne: {
filter: { ncbiId: compound.ncbiId },
upsert: true,
update: { $set: compound }
}
}
}),
{},
function(){
})
}
}
saxStream.on('text', function(text){
switch (field) {
case 'value':
var value = text
if (label.match(/IUPAC Name/i)) {
if (name.match(/Systematic/i)) {
compound.iupacSystematicName = value
} else if (name.match(/Preferred/i)) {
compound.iupacPreferredName = value
} else if (name.match(/Markup/i)) {
compound.iupacMarkupName = value
} else if (name.match(/CAS.like Style/i)) {
compound.iupacCASName = value
} else if (name.match(/Allowed/i)) {
compound.iupacAllowedName = value
}
} else if (label.match(/Fingerprint/i)) {
compound.fingerprint = value
} else if (label.match(/Compound Complexity/i)) {
compound.complexity = parseFloat(value)
} else if (label.match(/InChIKey/)) {
compound.inchiKey = value
} else if (label.match(/InChI/)) {
compound.inchi = value
} else if (label.match(/Log P/)) {
compound.logP = parseFloat(value)
} else if (label.match(/Mass/)) {
compound.mass = parseFloat(value)
} else if (label.match(/Molecular Formula/)) {
compound.formula = value
} else if (label.match(/Molecular Weight/)) {
compound.weight = value
} else if (label.match(/SMILES/)) {
if (name.match(/Canonical/)) {
compound.smiles = value
}
} else {
// console.log(label, name)
}
if (name) {
if (name.match(/Hydrogen Bond Acceptor/)) {
compound.hydrogenBondAcceptor = parseFloat(value)
} else if (name.match(/Hydrogen Bond Donor/)) {
compound.hydrogenBondDonor = parseFloat(value)
} else if (name.match(/Rotatable Bond/)) {
compound.rotatable = parseFloat(value)
} else if (name.match(/Polar Surface Area/)) {
compound.polarSurfaceArea = parseFloat(value)
}
}
break
case null:
case undefined:
break
default:
switch (type) {
case 'int':
point[field] = parseInt(text)
break
case 'float':
point[field] = parseFloat(text)
break
default:
point[field] = text
break
}
break
}
if (op == 'getAtom') {
point = compound.atoms[parseInt(text) - 1]
} else if (op == 'setBondStart') {
point.start = parseInt(text) - 1
} else if (op == 'setBondEnd') {
point.end = parseInt(text) - 1
} else if (op == 'setBondOrder') {
point.order = parseInt(text)
} else if (op == 'getLabel') {
label = text
} else if (op == 'getName') {
name = text
}
})
saxStream.on('opentag', function(node){
switch (node.name) {
case 'PC-COMPOUNDTYPE_ID_CID':
point = compound = {
atoms: [],
bonds: []
}
field = 'ncbiId'
type = 'string'
break
case 'PC-ELEMENT':
point = {}
compound.atoms.push(point)
field = 'element'
type = 'int'
break
case 'PC-ATOMINT_AID':
op = 'getAtom'
break
case 'PC-ATOMINT_VALUE':
field = 'charge'
type = 'int'
break
case 'PC-BONDS_AID1':
bondIndex = 0
break
case 'PC-BONDS_AID1_E':
point = {}
op = 'setBondStart'
compound.bonds.push(point)
break
case 'PC-BONDS_AID2':
bondIndex = 0
break
case 'PC-BONDS_AID2_E':
op = 'setBondEnd'
point = compound.bonds[bondIndex++]
break
case 'PC-BONDS_ORDER':
bondIndex = 0
break
case 'PC-BONDTYPE':
op = 'setBondOrder'
point = compound.bonds[bondIndex++]
break
case 'PC-CONFORMER_X':
atomIndex = 0
break
case 'PC-CONFORMER_X_E':
point = compound.atoms[atomIndex]
field = 'x'
type = 'float'
break
case 'PC-CONFORMER_Y':
atomIndex = 0
break
case 'PC-CONFORMER_Y_E':
point = compound.atoms[atomIndex]
field = 'y'
type = 'float'
break
case 'PC-COMPOUND_CHARGE':
point = compound
field = 'charge'
type = 'float'
break
case 'PC-URN_LABEL':
op = 'getLabel'
break
case 'PC-URN_NAME':
op = 'getName'
break
case 'PC-INFODATA_VALUE_IVAL':
case 'PC-INFODATA_VALUE_BINARY':
case 'PC-INFODATA_VALUE_SVAL':
case 'PC-INFODATA_VALUE_IVEC':
case 'PC-INFODATA_VALUE_FVAL':
case 'PC-INFODATA_VALUE_BVAL':
case 'PC-INFODATA_VALUE_BVEC':
case 'PC-INFODATA_VALUE_SLIST':
case 'PC-INFODATA_VALUE_FVEC':
case 'PC-INFODATA_VALUE_BITLIST':
case 'PC-INFODATA_VALUE_DATE':
field = 'value'
break
}
})
saxStream.on('end', function(){
write(true)
done()
})
saxStream.on('closetag', function(node){
type = null
field = null
op = null
switch (node) {
case 'PC-COMPOUND':
compounds.push(compound)
write(false)
compound = null
point = null
break
case 'PC-INFODATA':
name = null
label = null
value = null
break
}
})
var i = 0
xmlStream.pipe(gzip).pipe(saxStream)
}
По сути, это сводится к следующему:
const zlib = require('zlib')
const fs = require('fs')
const sax = require('sax')
const MongoClient = require('mongodb').MongoClient
// Connection URL
const url = 'mongodb://localhost:27017'
// Database Name
const dbName = 'foo'
const paths = fs.readdirSync('.')
MongoClient.connect(url, function(err, client) {
console.log("Connected successfully to server")
const db = client.db(dbName)
const compounds = db.collection('compounds')
compounds.createIndex(
{ "visited": 1 },
null,
function(err, results) {
console.log(results)
perform(compounds, paths)
}
)
})
function perform(compounds, paths) {
var i = 0
var log = fs.createWriteStream('log.csv')
log.write('path\n')
next()
function next() {
var name = paths[i++]
if (!name) return console.log('done')
doit(log, compounds, name, function(){
setImmediate(next)
})
}
}
function doit(log, collection, name, done) {
console.log('parsing...', name)
log.write(name + '\n')
const gzip = zlib.createGunzip()
const xmlStream = fs.createReadStream(name)
const out = fs.createWriteStream('foo.xml')
const saxStream = sax.createStream(false, {})
saxStream.on('error', function(e){
console.error('error!', e)
this._parser.error = null
this._parser.resume()
})
var compounds = []
var compound
var bondIndex = 0
var atomIndex = 0
var field
var label
var name
var point
var type
var op
var insert = 0
function write(end) {
// every 1000 I write
if (compounds.length === 1000 || end) {
var inserts = compounds
compounds = []
collection.bulkWrite(inserts.map(compound => {
compound.visited = false
return {
updateOne: {
filter: { ncbiId: compound.ncbiId },
upsert: true,
update: { $set: compound }
}
}
}),
{},
function(){
})
}
}
saxStream.on('text', function(text){
})
saxStream.on('opentag', function(node){
})
saxStream.on('end', function(){
write(true)
done()
})
saxStream.on('closetag', function(node){
compounds.push(compound)
write(false)
})
var i = 0
xmlStream.pipe(gzip).pipe(saxStream)
}
Тем не менее, я столкнулся с перегрузкой памяти, где он показывает ошибки памяти v8:
FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
Writing Node.js report to file: report.20190519.135156.12689.001.json
Node.js report completed
1: 0x100063e23 node::Abort() [/usr/local/bin/node]
2: 0x100064491 node::OnFatalError(char const*, char const*) [/usr/local/bin/node]
3: 0x10017e24f v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
4: 0x10017e1f0 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
5: 0x100440458 v8::internal::Heap::UpdateSurvivalStatistics(int) [/usr/local/bin/node]
6: 0x100441e75 v8::internal::Heap::CheckIneffectiveMarkCompact(unsigned long, double) [/usr/local/bin/node]
7: 0x10043f7ed v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector, v8::GCCallbackFlags) [/usr/local/bin/node]
8: 0x10043e5e3 v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace, v8::internal::GarbageCollectionReason, v8::GCCallbackFlags) [/usr/local/bin/node]
9: 0x1004464b1 v8::internal::Heap::AllocateRawWithLightRetry(int, v8::internal::AllocationSpace, v8::internal::AllocationAlignment) [/usr/local/bin/node]
10: 0x100446500 v8::internal::Heap::AllocateRawWithRetryOrFail(int, v8::internal::AllocationSpace, v8::internal::AllocationAlignment) [/usr/local/bin/node]
11: 0x1004268c7 v8::internal::Factory::NewFillerObject(int, bool, v8::internal::AllocationSpace) [/usr/local/bin/node]
12: 0x100603e6e v8::internal::Runtime_AllocateInNewSpace(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/bin/node]
13: 0x34613a14fc7d
Abort trap: 6
Вопрос в том, что я делаю неправильно при потоковой передаче этих XML-файлов и их анализе. Что я могу сделать по-другому, чтобы не столкнуться с ошибкой памяти. Может быть, это потому, что я не закрываю потоки после использования? Я не знаю.