mongoose 与 pymongo 驱动程序写入(insertMany)性能对比测试

2024-05-15 12:33:33 发布

您现在位置:Python中文网/ 问答频道 /正文

使用python进行基准测试

import json
import os
from datetime import datetime

import pymongo
from bson import ObjectId

# Module-level buffer of parsed documents; writeJsons() inserts its contents
# and then resets it to an empty list.
jsons = []

# Connection string and input file path both come from the environment.
DB_URL = os.environ.get('DB_URL')
FILE_PATH = os.environ.get('FILE_PATH')

client = pymongo.MongoClient(DB_URL)
# get_database() is called without a name, so the database is whichever one
# the client resolves as its default (presumably the one named in DB_URL --
# confirm against the pymongo documentation).
database = client[client.get_database().name]


collection_name = 'test_collection'

# Drop the target collection so each run starts from scratch.
database[collection_name].drop()

# Compound ascending index on the 'p.k' / 'p.v' fields, requested with
# background=True, created before any documents are inserted.
database[collection_name].create_index([('p.k', pymongo.ASCENDING),
                                        ('p.v', pymongo.ASCENDING)],
                                        background=True)

def writeJsons():
    """Insert every buffered document with one ``insert_many`` call.

    Reads the module-level ``jsons`` buffer, writes its contents to
    ``database[collection_name]``, prints the batch size together with the
    elapsed wall-clock time in seconds, and then resets the buffer to an
    empty list. Returns immediately when the buffer is empty.
    """
    global jsons
    if not jsons:
        return
    print('Start {}'.format(len(jsons)))
    ts = datetime.utcnow()
    database[collection_name].insert_many(jsons)
    # Convert the timedelta to a float: formatting the timedelta itself prints
    # "H:MM:SS.ffffff", which is not a number of seconds despite the label.
    elapsed = (datetime.utcnow() - ts).total_seconds()
    print('{} {} seconds'.format(len(jsons), elapsed))
    jsons = []

# Stream the export one line at a time. Iterating the file object keeps only
# the current batch in memory, whereas readlines() would load the entire
# export before the first insert.
with open(FILE_PATH) as fl:
    for line in fl:
        # A blank line is not a JSON document; skip it instead of letting
        # json.loads abort the whole run.
        if not line.strip():
            continue
        j = json.loads(line)
        # Store _id as a real ObjectId, the same type the Node.js benchmark
        # in this file inserts, so both drivers write equivalent documents.
        j['_id'] = ObjectId(j['_id']['$oid'])
        for o in j['p']:
            # Values exported as {"$numberLong": "..."} are unwrapped to the
            # inner value.
            if isinstance(o['v'], dict):
                o['v'] = o['v']['$numberLong']
        jsons.append(j)
        # Flush every 100000 documents.
        if len(jsons) == 100000:
            writeJsons()

# Flush the final, partially filled batch.
writeJsons()

使用 Node.js 进行基准测试

import mongoose from 'mongoose'

import process from 'process'
import events from 'events'
import fs from 'fs'
import readline from 'readline'

// Connection string and input file path both come from the environment.
const DB_URL = process.env.DB_URL
const FILE_PATH = process.env.FILE_PATH

// Start connecting; the returned promise is not awaited here. Connection
// results are observed through the 'error' listener below and an 'open'
// listener registered later in the file.
mongoose.connect(DB_URL, {
  useFindAndModify: false,
  useUnifiedTopology: true,
  useNewUrlParser: true
})

let db = mongoose.connection
db.on('error', err => console.log('Failed to connect at %s: %s', DB_URL, err))

// Schema with no declared fields and strict mode, save-time validation and
// the version key all switched off (presumably so the exported documents are
// stored unchanged -- confirm against the Mongoose schema options docs).
const DataSchema = new mongoose.Schema(
  {},
  { validateBeforeSave: false, versionKey: false, strict: false }
)
// Compound ascending index on 'p.k' / 'p.v', requested with background: true.
DataSchema.index({ 'p.k': 1, 'p.v': 1 }, { background: true })
const collectionName = 'test_collection'
// The same string is used for both the model name and the collection name.
const DataModel = mongoose.model(
  collectionName,
  DataSchema,
  collectionName
)

const fileStream = fs.createReadStream(FILE_PATH)

// Line reader over the export file; crlfDelay: Infinity is the setting the
// Node.js readline docs give for treating every "\r\n" as one line break.
const rl = readline.createInterface({
  input: fileStream,
  crlfDelay: Infinity
})

// Buffer of parsed documents waiting to be inserted.
let jsons = []

// Promise of the most recent batch write: reassigned by the 'pause' handler
// and awaited by the 'close' handler.
let promise = Promise.resolve()

/**
 * Insert one batch (at most 100000 documents) from the module-level `jsons`
 * buffer with a single `DataModel.insertMany` call and log how long it took.
 *
 * Documents pushed onto `jsons` while the insert is in flight are left in the
 * buffer for the next call. Resolves without doing anything when the buffer
 * is empty.
 *
 * @returns {Promise<void>} settles once the insert has finished; rejects with
 *   whatever `insertMany` rejects with.
 */
async function writeJsons() {
  if (!jsons.length) {
    return
  }
  // Detach the batch synchronously, before the first await. readline can
  // still deliver 'line' events after rl.pause(), so `jsons` may grow while
  // the insert is pending; those documents must not be thrown away.
  const batch = jsons.splice(0, 100000)
  console.log(`Start ${batch.length}`)
  const startTime = Date.now()
  const startTimeArray = process.hrtime()
  await DataModel.insertMany(batch, {
    bypassDocumentValidation: true
  })
  const diff = Date.now() - startTime
  // Report the size of the batch that was written, not the current buffer.
  console.log(
    `${batch.length} ${diff / 1000} seconds ${process.hrtime(startTimeArray)}`
  )
}

// Parse each exported line into a document and buffer it. Once 100000
// documents are buffered, pause the reader so the 'pause' handler can flush.
rl.on('line', line => {
  try {
    const json = JSON.parse(line.toString())
    json._id = mongoose.Types.ObjectId(json._id.$oid)
    for (const o of json.p) {
      // typeof null is also 'object'; exclude it so a null value is kept
      // as-is instead of throwing and dropping the whole document.
      if (o.v !== null && typeof o.v === 'object') {
        o.v = o.v.$numberLong
      }
    }
    jsons.push(json)
    if (jsons.length === 100000) {
      rl.pause()
    }
  } catch (err) {
    // A line that cannot be parsed or converted is logged and skipped.
    console.log(line.toString(), err)
  }
})

// Flush the buffer whenever the reader pauses, then let it continue. A failed
// insert is logged rather than left as an unhandled rejection, and the reader
// is resumed either way so the run cannot stall on a paused stream.
rl.on('pause', () => {
  promise = writeJsons()
    .catch(err => console.log('Failed to write batch: %s', err))
    .then(() => rl.resume())
})

// At end of input, wait for the in-flight batch and then write what is left.
rl.on('close', async () => {
  try {
    await promise
    await writeJsons()
  } catch (err) {
    console.log('Failed to write final batch: %s', err)
  }
})

// NOTE(review): this drop runs when the connection opens, independently of
// the reader above, which starts as soon as the interface is created --
// confirm the drop always completes before the first batch is inserted.
db.once('open', async () => {
  console.log(`Successfully connected at ${DB_URL}`)
  try {
    await mongoose.connection.dropCollection(collectionName)
  } catch (err) {
    // A missing collection (server error 26, NamespaceNotFound) is expected
    // on a fresh database; anything else is reported instead of hidden.
    if (err.code !== 26 && err.codeName !== 'NamespaceNotFound') {
      console.log('Failed to drop collection %s: %s', collectionName, err)
    }
  }
})

输入文件是导出的集合,大小为 613 MB,包含超过 170 万条记录(> 1.7 M)。

每个文档看起来像

{_id:...,
 p: [
     {k:'somekey', v: 'somevalue'},
     ...
     ]
}

使用的 MongoDB 版本为 4.0。

结果非常稳定:Python 2 + pymongo 3.8 比 Node.js 10 + mongoose 5.7 快 2-3 倍。有人有类似的经历或看法吗?按理说,Node.js 应该比 Python 快一些才对。


Tags: path, from, import, json, url, db, if, line