Best way to insert 10,000 JSON files (total 30GB) into MongoDB - python

Using python to combine all the json files doesn't look convenient to me, since the combined file would be 30GB.
I am using MongoDB and Node.js. The way I populate a sample json is:
var data = require('./data1.json');
var populateDB = function() {
    db.collection('temp', function(err, collection) {
        collection.insert(data, {safe: true}, function(err, result) {});
    });
};
This only adds one json file. How should I populate the collection with the 10,000+ json files from here? Any suggestion is highly appreciated!

The easiest way would be to write a Node program that processes one JSON file and then run it multiple times from the shell:
for i in *.json; do node program.js $i; done
Your Node program would just need to read the file name from process.argv instead of having it hardcoded, but the logic would be the same.
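A minimal sketch of such a program (assumptions: a local MongoDB on the default port, the same temp collection as in your snippet, and the file name arriving as the first CLI argument):

// insertOne.js -- hypothetical name; inserts the JSON file named on the command line
const fs = require('fs');
const MongoClient = require('mongodb').MongoClient;

const file = process.argv[2]; // e.g. data1.json from the shell loop
const data = JSON.parse(fs.readFileSync(file, 'utf8'));

MongoClient.connect('mongodb://localhost:27017/test', function (err, db) {
    if (err) throw err;
    db.collection('temp').insert(data, { safe: true }, function (err, result) {
        if (err) console.log(err);
        db.close();
    });
});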
If you want to do everything in Node, then you will have to read the directory, collect all the .json files, read each one in sequence, and then run code similar to what you posted. If it's a one-off task, you can even get away with using the "Sync" functions to simplify your code, since you're doing one thing at a time and don't care about adding the data in parallel; a sketch follows.
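A rough sketch of that one-off variant (assumptions: all files sit in one ./files directory; the reads are synchronous, while the inserts still fire asynchronously as each file is read):

const fs = require('fs');
const MongoClient = require('mongodb').MongoClient;

MongoClient.connect('mongodb://localhost:27017/test', function (err, db) {
    if (err) throw err;
    const col = db.collection('temp');
    fs.readdirSync('./files')
        .filter(function (f) { return f.endsWith('.json'); })
        .forEach(function (f) {
            // read one file at a time; each file is small even if the total is 30GB
            const data = JSON.parse(fs.readFileSync('./files/' + f, 'utf8'));
            col.insert(data, { safe: true }, function (err) {
                if (err) console.log(err);
            });
        });
});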

Something like this would work:
npm i glob-fs mongodb async --save
const async = require('async');
const fs = require('fs');
const glob = require('glob-fs')({ gitignore: true });
const MongoClient = require('mongodb').MongoClient;

const files = './files/data*.json';
const collection = 'test';
const url = 'mongodb://localhost:27017/test';

// Connect to the db
MongoClient.connect(url, function (err, db) {
    if (err) {
        return console.log(err);
    }
    // Get the collection
    const col = db.collection(collection);
    glob.readdirPromise(files)
        .then(function (f) {
            // Insert the files one at a time so we never hold all 30GB in memory
            async.eachSeries(f, (item, callback) => {
                fs.readFile(item, 'utf8', function (err, data) {
                    if (err) {
                        return callback(err);
                    }
                    // Insert into mongo
                    col.insertMany(JSON.parse(data)).then((r) => {
                        console.log(r);
                        callback();
                    }).catch(function (fail) {
                        callback(fail);
                    });
                });
            }, err => {
                if (err) {
                    console.log(err);
                }
                db.close();
            });
        })
        .catch(err => {
            console.log(err);
        });
});
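One caveat worth hedging on: insertMany expects an array of documents, so if some of your files hold a single top-level object rather than an array, branch on the parsed value inside the readFile callback:

const parsed = JSON.parse(data);
// insertMany needs an array; fall back to insertOne for a single document
const op = Array.isArray(parsed) ? col.insertMany(parsed) : col.insertOne(parsed);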

Related

How can I use node.js to run a python file that runs forever?

I am wondering how to use node.js to run a python file that prints until it stops. Right now, when I run it, it does not print anything; is there a way I can make it work properly?
Node.js
let { PythonShell } = require('python-shell');
var options = {
    pythonOptions: ['-u']
};
PythonShell.run('main.py', options, function (err, result) {
    if (err) throw err;
    // result is an array consisting of messages collected
    // during execution of the script.
    if (result !== null) {
        console.log(result.toString());
    }
});
A function similar to mine
main.py
num = 1
while True:
    print(num)
    num += 1
I'm not familiar with the python-shell package, but you can easily spawn a new process to run python programs by using the spawn method from the child_process module that comes with node.
Here is how you can use it:
const { spawn } = require("child_process");

// "-u" keeps Python's stdout unbuffered, so the loop's prints arrive immediately
const cmd = spawn("python3", ["-u", __dirname + "/main.py"]);

cmd.stdout.on("data", (data) => {
    console.log(`stdout: ${data}`);
});

cmd.stderr.on("data", (data) => {
    console.error(`stderr: ${data}`);
});

cmd.on("close", (code) => {
    console.log(`child process exited with code ${code}`);
});
Read the documentation for more info [link]
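If you'd rather keep python-shell, note that PythonShell.run only delivers its results array after the script exits, which never happens with an infinite loop. A sketch based on python-shell's documented event interface: instantiate a shell and listen for 'message' events so each printed line arrives as it happens:

let { PythonShell } = require('python-shell');

let pyshell = new PythonShell('main.py', { pythonOptions: ['-u'] });

// fires once per line the script prints, instead of waiting for the script to exit
pyshell.on('message', function (message) {
    console.log(message);
});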

Call a Python script from React with next routing and a node.js server

I am working on an ethereum application that uses react, next-routing and a basic node.js server. I want to run a python script on a specific route, claimAnalysis, in order to perform some predictions. I want the script to be executed every time I visit the claimAnalysis route.
server.js
const express = require("express")();
const next = require("next");

const app = next({
    dev: process.env.NODE_ENV !== "production"
});

const routes = require("./routes");
const handler = routes.getRequestHandler(app);

app.prepare().then(() => {
    const server = express.use(handler);
    server.listen(3000, (err) => {
        if (err) throw err;
        console.log("Server ready on localhost:3000");
    });
});
routes.js
const routes = require("next-routes")();

routes
    .add("/insurance/new", "/insurance/new")
    .add("/insurance/:address", "/insurance/show")
    .add("/insurance/:address/claims", "/insurance/claims/index")
    .add("/insurance/:address/claims/new", "/insurance/claims/new")
    .add("/insurance/:address/claims/:id/claimAnalysis", "/insurance/claims/claimAnalysis");

module.exports = routes;
Please guide me on how I can call a function from the claimAnalysis.js file that runs the python script.
If you're using Node 10+, you can use util.promisify to execute your python script, which returns a Promise<{ stdout, stderr }> object. See the example below:
const util = require('util');
const exec = util.promisify(require('child_process').exec);

// function to execute a python script
async function executeScript(path) {
    try {
        const { stdout, stderr } = await exec(`python3 ${path}`);
        console.log('stdout:', stdout);
        console.log('stderr:', stderr);
    } catch (e) {
        console.error(e);
    }
}
Now let's use the function in your route:
app.get('/insurance/claims/claimAnalysis', async function (req, res) {
    await executeScript('path/to/your_script.py'); // pass the path to your python script
    res.send('claimAnalysis request completed');
    ...
});
You can use child_process to do this. Here is an example:
const path = require('path');
const { spawn } = require('child_process');

/**
 * Run python myscript, passing in `-u` so console output is not buffered
 * @return {ChildProcess}
 */
function runScript() {
    return spawn('python', [
        "-u",
        path.join(__dirname, 'myscript.py'),
        "--foo", "some value for foo",
    ]);
}
const subprocess = runScript();

// print output of the script
subprocess.stdout.on('data', (data) => {
    console.log(`data: ${data}`);
});
subprocess.stderr.on('data', (data) => {
    console.log(`error: ${data}`);
});
// listen for exit on the process itself, not on stderr
subprocess.on('close', () => {
    console.log("Closed");
});
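To tie this back to the claimAnalysis route, a sketch under assumptions (the express app from server.js above, with this handler registered before the next-routes handler takes over): buffer stdout until the child exits, then respond:

express.get("/insurance/:address/claims/:id/claimAnalysis", (req, res) => {
    const subprocess = runScript();
    let output = "";
    subprocess.stdout.on("data", (data) => { output += data; });
    subprocess.on("close", (code) => {
        // respond only once the script has finished
        res.send(output);
    });
});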

NodeJs: Get output of python-shell to send back to client

I am trying to create a website where a user can submit python code, it gets sent to my server to be executed and I send back the results to the client. Currently I am using a NodeJs server and need to run the python code from there. To do this, I am using Python-shell like so:
const runPy = async (code) => {
    const options = {
        mode: 'text',
        pythonOptions: ['-u'],
        scriptPath: path.join(__dirname, '../'),
        args: [code],
    };
    const result = await PythonShell.run('script.py', options, (err, results) => {
        if (err) throw err;
        return results; <----- HOW DO I RETURN THIS
    });
    console.log(result.stdout);
    return result;
};
I understand I can console.log() the results in the PythonShell.run() but is there a way to return the results from my runPy function to then be manipulated and sent back to the client?
From the python-shell documentation, it looks like the PythonShell.run method doesn't have a promise-based mode. So, one option is to wrap it in a promise:
const runPy = async (code) => {
    const options = {
        mode: 'text',
        pythonOptions: ['-u'],
        scriptPath: path.join(__dirname, '../'),
        args: [code],
    };
    // wrap it in a promise, and `await` the result
    const result = await new Promise((resolve, reject) => {
        PythonShell.run('script.py', options, (err, results) => {
            if (err) return reject(err);
            return resolve(results);
        });
    });
    console.log(result); // `result` is the array of output lines, not a process object
    return result;
};
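From there, sending the output back to the client is just a matter of awaiting runPy in a request handler. A minimal sketch, assuming an Express app with a JSON body parser and that the submitted code arrives in the request body:

app.post('/run', async (req, res) => {
    try {
        const lines = await runPy(req.body.code); // array of output lines in 'text' mode
        res.json({ output: lines.join('\n') });
    } catch (err) {
        res.status(500).json({ error: String(err) });
    }
});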

Can I use the child_process to spawn multiple python outputs on a server?

I am trying to use NodeJS to execute multiple python scripts and send their output to a local host. I would like not to hard-code the exact python scripts, but rather execute anything that ends in ".py".
I have tried to run multiple processes, but the last one overwrote the former on the localhost.
Python scripts:
hellothere.py
print("hello there")
helloworld.py
print("Hello World!")
Goodbye.py
print("Goodbye!")
Pythonspawn.js
var express = require('express');
var app = express();

app.get('/name', function callName(req, res) {
    var spawn = require("child_process").spawn;

    var PythonProcess1 = spawn('python', ["./hellothere.py"]);
    var PythonProcess2 = spawn('python', ['./helloworld.py']);
    var PythonProcess3 = spawn('python', ['./Goodbye.py']);

    PythonProcess1.stdout.on('data', function(data) {
        res.send(data.toString());
    });
    PythonProcess2.stdout.on('data', function(data) {
        res.send(data.toString());
    });
    PythonProcess3.stdout.on('data', function(data) {
        res.send(data.toString());
    });
});

app.listen(1820, function() {
    console.log('Server is running on port %d.', this.address().port);
});
I would like to execute any python script that ends in ".py" rather than specifying the exact script to execute. If possible, I would also like to execute scripts that take different numbers of arguments (e.g. if helloworld.py took two sys.argv values and Goodbye.py took one).
You can make use of exec() here. In this example I check for all the .js files in the current working directory, execute each of them, collect the results into an array, and finally log it; for your case, swap in .py and python:
const { exec } = require('child_process');

var result = [];

exec('ls | grep .js', (error, stdout, stderr) => {
    if (error) {
        console.error(`exec error: ${error}`);
        return;
    }
    var s = stdout.split('\n');
    s.pop(); // drop the trailing empty entry
    console.log(s);
    executeFiles(s);
});

function executeFiles(filenames) {
    filenames.forEach((element, index) => {
        exec(`node ${element}`, (error, stdout, stderr) => {
            if (error) {
                console.error(`exec error: ${error}`);
                return;
            }
            console.log(stdout);
            result.push(stdout.toString());
            if (index === filenames.length - 1) {
                console.log(result);
                return result;
            }
        });
    });
}
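Adapted to the question's case, a sketch under assumptions (the scripts live in the working directory, and the script names and argument values below are placeholders): list each script with its own argument array, so differing argument counts work the same way:

const { exec } = require('child_process');

// each entry is [script, ...args], so argument counts can differ per script
const jobs = [
    ['helloworld.py', 'first', 'second'],
    ['Goodbye.py', 'only'],
];

jobs.forEach(([script, ...args]) => {
    exec(`python ${script} ${args.join(' ')}`, (error, stdout, stderr) => {
        if (error) {
            return console.error(`exec error: ${error}`);
        }
        console.log(`${script}: ${stdout}`);
    });
});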

return JSON from python to node via spawn

I have a python script that takes two arguments; a directory and a file name.
The python script will create a JSON object from specific files in the directory provided and save it with the name being the second argument.
However, if the second argument is equal to the string "stream", then the JSON data is output to STDOUT.
I wrote a node script that spawns a child process to call the python script from terminal and it works as intended.
"use strict";
const spawn = require("child_process").spawn;
const command = "(path to python)";
const loc = "(path to .py script)";
const acct = process.argv[2];
const output = process.argv[3];
let callPy = spawn(command, ["erik.py", acct, output], {
cwd: loc,
stdio: "pipe"
});
callPy.stdout.on("data", (data) => {
if (data.toString() === "success") {
console.log(acct, "generated");
} else {
console.log(data.toString());
}
});
EDIT:
I have unmarked this issue as solved. After spending a bit more time trying to implement this, I have not come to a satisfactory solution that lets me synchronously call a child process from node, signal the python script to emit JSON data, receive the data, and then send it to the browser. I tried using a promise chain on the child process:
let child = require("child_process").spawn; // or spawnSync

let spawn = () => {
    let spawned = child(command, args, options, (err, stdout, stderr) => {
        if (err) { console.log(err) };
    });
    return spawned;
};

let listen = (child) => {
    child.stdout.on("data", (data) => {
        console.log("PID", child.pid);
        console.log("data from listen func: ", data);
        return child;
    });
};

let kill = (child) => {
    child.kill("SIGTERM");
};

var p = new Promise((res, e) => {
    res(spawn());
    e(console.error(e));
});

p.then(result => {
    return listen(result);
});
p.then(result => {
    return kill(result);
});
Using spawn(), the child terminates before any of the data is returned.
Using spawnSync(), the promise chain tries (and fails) to listen on the child's io before the child is spawned.
I have yet to try websockets to transmit the data, but I doubt that will solve this: the promise is returning an empty object to my router function invocation before the promise chain retrieves the chunks from the python script.
Any further insight is welcome.
So you need at least two things to do this:
a way to queue commands to execute with spawn
an async pattern that waits for each command's execution and joins the processes as each executable terminates
A minimal example is:
var cmd = new CommandLine({
    debug: true,
    error: true,
    delay: true
});

// commandItemsArray is a list of commands: command, options, arguments
var commandItemsArray = [['ls', '-l', './'], ['ls', '-a', './']];

cmd.executeCommands(commandItemsArray,
    function(results) {
        console.log(results);
    },
    function(error) {
        console.log(error);
    });
There are several packages on npm that do both (search for node cli, command line, etc.); one is node-commander, which uses a Promise.all pattern to achieve the second task:
function PromiseAll(items, block, done, fail) {
    var promises = [], index = 0;
    items.forEach(function(item) {
        // wrap each item in an immediately-invoked function so `i` is captured per item
        promises.push(function(item, i) {
            return new Promise(function(resolve, reject) {
                return block.apply(this, [item, i, resolve, reject]);
            });
        }(item, ++index));
    });
    Promise.all(promises).then(function AcceptHandler(results) {
        if (done) done(results);
    }, function ErrorHandler(error) {
        if (fail) fail(error);
    });
} //promiseAll
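For illustration, a hedged usage sketch: the block receives each item plus resolve/reject, so each spawned process can settle its own promise when it exits. erik.py and its "stream" flag are borrowed from the question; the account names are placeholders:

PromiseAll(['acct1', 'acct2'], function (item, i, resolve, reject) {
    const proc = require('child_process').spawn('python', ['erik.py', item, 'stream']);
    let out = '';
    proc.stdout.on('data', (chunk) => { out += chunk; });
    proc.on('close', (code) => {
        if (code === 0) resolve(JSON.parse(out));
        else reject(new Error('erik.py exited with code ' + code));
    });
}, function (results) {
    console.log(results); // one parsed JSON object per account
}, function (error) {
    console.error(error);
});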
I was able to resolve this issue relatively simply using websockets:
the client submits the request, which is communicated to the server via socket.IO; the request is received and the spawn event is triggered; when the chunks are finished appending, a termination event is emitted, which triggers killing of the child process and returning the data to the client
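A minimal sketch of that flow, with the caveat that the event names ('request', 'result') and the accumulate-then-emit shape are assumptions rather than the exact code; command and loc are the same placeholders as in the script above:

const io = require('socket.io')(3000);
const spawn = require('child_process').spawn;

io.on('connection', (socket) => {
    socket.on('request', (acct) => {
        // "stream" tells erik.py to print its JSON to stdout
        const child = spawn(command, ['erik.py', acct, 'stream'], { cwd: loc });
        let chunks = '';
        child.stdout.on('data', (data) => { chunks += data; });
        child.on('close', () => {
            // the child has exited; hand the accumulated JSON back to the client
            socket.emit('result', JSON.parse(chunks));
        });
    });
});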
