Communicate between Node.js and Python when the data fetch is slow - python

I'm trying to program a web crawler.
I have server.js / crawling.js / dataCrawler.py.
When I call crawlData (defined in crawling.js) from server.js, the method in crawling.js uses spawn to execute dataCrawler.py.
I need the data in server.js, but executing dataCrawler.py takes a while, so I get null or undefined instead of the proper data.
Do you have any solution? Or has anyone had the same issue?
My code is below. (It isn't complete; it's just a reference for the structure.)
//server.js
var crawler = require("./crawling");
var resultArr = crawler.crawlData();
console.log('nodeserver:', resultArr);

//crawling.js
exports.crawlData = () => {
    var dataArr = [];
    var temp;
    var py = spawn('python', ['dataCrawler.py']);
    var data = [totalUrl, gubun];
    var dataFromPy = null;
    py.stdout.on('data', function(result){
        dataArr = JSON.parse(encoding.convert(result, 'utf-8'));
        py.stdout.on('end', function(){
            temp = dataArr;
        });
    });
    py.stdin.write(JSON.stringify(data));
    py.stdin.end();
    return temp;
}

#dataCrawler.py
def crawling(url, gubun, page_count):
    idx = 0
    result = []
    jsonData = {}
    for i in range(1, page_count + 1):
        ....
        crawling code
        ....
    return result

def main():
    lines = sys.stdin.readlines()
    paraFromServer = json.loads(lines[0])
    url = paraFromServer[0]
    gubun = paraFromServer[1]
    result = crawling(url, gubun, page_count)
    print(result)

main()

You didn't account for the asynchronous nature of JavaScript. What you have to do is pass a callback into the crawlData method, which will be invoked once scraping is done.
exports.crawlData = (cb) => {
    ....
    py.stdout.on('data', function(result){
        var dataArr = JSON.parse(encoding.convert(result, 'utf-8'));
        py.stdout.on('end', function(){
            cb(dataArr); // ideally the pattern is cb(error, data)
        });
    });
    ...
...
So server.js becomes:
var crawler = require("./crawling");
crawler.crawlData((data) => {
    console.log(data);
    // Do whatever you want to do with the data.
});
Callbacks can lead to callback hell; try exploring promises or async/await.
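One more pitfall worth checking, on the Python side this time: print(result) writes Python's repr of the list (single quotes), which JSON.parse on the Node side will reject. A minimal sketch of the fix, assuming result is a JSON-serializable list (the emit helper name is just for illustration):
import json
import sys

def emit(result):
    # json.dumps produces valid double-quoted JSON that JSON.parse accepts;
    # a bare print(result) would emit Python's single-quoted repr instead.
    sys.stdout.write(json.dumps(result))
    sys.stdout.flush()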

Alternatively, you can use spawnSync if running in parallel isn't a concern:
exports.crawlData = () => {
    const result = spawnSync('python', ['dataCrawler.py'], {
        input: JSON.stringify([totalUrl, gubun])
    });
    // spawnSync returns an object; the script's output is in result.stdout
    return JSON.parse(encoding.convert(result.stdout, 'utf-8'));
}

Related

Python: save an embedded base64 PDF string into a file

I have an HTML document with a PDF embedded in base64-encoded format. I'd like to extract the string and save it as a PDF file; I'm using the code below to do so.
But on opening it, Adobe Reader says the format is invalid, and I'm looking to fix this issue.
I think the PDF was encoded using JavaScript's encodeURIComponent function, so it needs to be converted back using Python.
sample embed tag
<embed type="application/pdf" src="data:application/pdf;base64,JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PC9D">
Code
import base64
from io import BytesIO
from pathlib import Path

def decode_b64():
    b64 = "JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PC9D"
    buffer = BytesIO()
    content = base64.b64decode(b64)
    buffer.write(content)
    with open(Path(Path.home(), 'Downloads', 'mytest.pdf'), "wb") as f:
        f.write(buffer.getvalue())

if __name__ == "__main__":
    decode_b64()
=== Update 1:
I found a way to convert it using JavaScript; it would be nice if we could port this code to Python.
const {readFileSync, writeFile, promises: fsPromises} = require('fs');
var data = readFileSync("pdf-file.html", 'utf-8');
var DOMParser = require('xmldom').DOMParser;
var parser = new DOMParser();
const virtualDoc = parser.parseFromString(data, 'text/html');
var elem = virtualDoc.getElementsByTagName('embed')[0];
for (var i = 0; i < elem.attributes.length; i++) {
    var attrib = elem.attributes[i];
    if (attrib.specified) {
        if (attrib.name == "src") {
            var result = attrib.value;
            result = result.replace('data:application/pdf;base64,', '');
            let buff = Buffer.from(decodeURIComponent(result), 'base64');
            writeFile('pdf-file.pdf', buff, err => {
                if (err) {
                    console.error(err);
                }
            });
        }
    }
}
This is a situation that you should have been able to chase down yourself. I wasn't 100% sure how JavaScript encoded those two characters, so I wrote up a simple HTML page:
<script>
    var s = "abcde++defgh//";
    alert(encodeURIComponent(s));
</script>
When I ran that page, the result was "abcde%2B%2Bdefgh%2F%2F", and that is all the information you need to fix up those strings.
import base64
from pathlib import Path

def decode_b64():
    b64 = "JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PC9D......"
    b64 = b64.replace('%2B', '+').replace('%2F', '/')
    content = base64.b64decode(b64)
    with open(Path(Path.home(), 'Downloads', 'mytest.pdf'), "wb") as f:
        f.write(content)
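As an aside, Python's standard library can undo encodeURIComponent-style escaping generally, instead of replacing the two sequences by hand; a small sketch (decode_escaped_b64 is an illustrative name):
import base64
from urllib.parse import unquote

def decode_escaped_b64(b64):
    # unquote undoes percent-escapes such as %2B -> '+' and %2F -> '/',
    # plus anything else encodeURIComponent may have escaped.
    return base64.b64decode(unquote(b64))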

How to get the progress of a file upload using requests.post() when the file was uploaded via form-data? (Not sure whether this is a streaming process)

Suppose I'm running some kind of web service with Python & Flask that provides file upload. It should be noted that my service is only a kind of transfer station: I "repost" the file elsewhere (say, to a massive centralized file storage) as soon as I receive a file from the frontend form-data. The Flask code looks like this:
@admin.route('/data', methods=['POST'])
def data_upload():
    if 'file' in request.files:
        f = request.files['file']
        try:
            r = requests.post(DataConfig.server + '/upload.php', files={'file': (f.filename, f)})
            return {'e': 0, 'msg': r.content.decode('utf-8')}
        except RequestException:
            return ReturnCode.ERR_NETWORK_FAILURE
    return ReturnCode.ERR_BAD_DATA
The frontend is not allowed to send the file or data directly to upload.php, since the server key must not be exposed to users.
Now I've got two problems that confuse me.
1. Is this process streaming or streaming-ready? I mean, will my Python server receive the whole file and store it somewhere temporarily, or will the file be reposted in chunks (like iter_content)? If streaming is not enabled by default, how can I enable it to give a better large-file upload experience?
2. Is it possible to show the user (frontend, browser) the progress of the upload?
For question 2, there are some posts indicating that tqdm will help, but tqdm is a CLI tool, and printing anything in the Python server's terminal makes no sense; the tqdm documentation also doesn't show any obvious API to get a percentage. What's more, sending this percentage to the frontend would probably require some XHR techniques, which I don't believe my code can deal with.
PART 1 OF MY ANSWER : THE JAVASCRIPT
async function ubox_send_chucks (divid) {
    var chunstart = 0;
    var chunend = 0;
    var chunksize = 1024 * 1024 * 9;
    var newblobchun_id = parseInt(Math.floor(Math.random() * 100000000) + 1);
    fi = $("#"+divid).files;
    stubuploads = fi.length;
    for (var i = 0; i < fi.length; i++) {
        var thismediaby_id = (newblobchun_id + i + 3);
        $("#progressx").append("<div id=\'loaderz_"+thismediaby_id+"_message\' class=\'padding3 margin3\' >FILE "+i+" (PREP)</div>");
        $("#phpx").append("<div id=\'phpz_"+thismediaby_id+"_message\' class=\'padding3 margin3\' >FILE "+i+" (PREP)</div>");
    }
    for (var i = 0; i < fi.length; i++) {
        if ( fi[i].size > 0 ) {
            var numberofchunks = Math.ceil( fi[i].size / chunksize );
            var thismediaby_id = (newblobchun_id + i + 3);
            logx("FILE "+i+" -- size: "+fi[i].size+" name: "+fi[i].name+" n of chunks to send: "+numberofchunks );
            // SEND EACH CHUNK
            for (var c = 0; c <= numberofchunks; c++) {
                chunstart = (c * chunksize);
                chunend = chunstart + chunksize + 1;
                if ( chunend > fi[i].size ) { chunend = fi[i].size; }
                var thischunk = fi[i].slice(chunstart, chunend);
                var thismediaby_name = thismediaby_id+"--chunk_"+c;
                console.log("FILE "+i+" send chunk: "+c+" start: "+chunstart+" end: "+chunend);
                upload_percent = ( c / numberofchunks ) * 100;
                $("#loaderz_"+thismediaby_id+"_message").html("FILE "+i+" : " + Math.round(upload_percent) + " %");
                var fd = new FormData();
                fd.append("data", thischunk, encodeURIComponent(thismediaby_name));
                fd.append("thismedia_id", encodeURIComponent(thismediaby_id));
                fd.append("thismedia_name", encodeURIComponent(fi[i].name));
                fd.append("numberofchunks", encodeURIComponent(numberofchunks));
                fd.append("thischunk_number", encodeURIComponent(c));
                fd.append("thisfilex", encodeURIComponent(i));
                fd.append("thissession", encodeURIComponent(thissession));
                fd.append("howmanyfiles", encodeURIComponent(fi.length));
                var pcache = (Math.floor(Math.random() * 100000000) + 1);
                await fetch("/templates/tests_ubox_slice/slice_receiver.php?pcache="+pcache, { method: "POST", body: fd })
                    .then(function (response) { return response.text(); })
                    .then(function (html) { $("#php_message").html(html); })
            }
            // WHEN ALL CHUNKS ARE SENT, TRIGGER A RECOMBINE (SAFER)
            // AJAX FUNCTION HERE https://stubs.s3.filebase.com/media/stubs/202111100393/recookies.js
            var combinex = [];
            combinex["thismedia_id"] = encodeURIComponent(thismediaby_id);
            combinex["thissession"] = encodeURIComponent(thissession);
            combinex["thismedia_name"] = encodeURIComponent(fi[i].name);
            stubajax("combiner","/templates/tests_ubox_slice/slice_combiner.php?pcache="+pcache,combinex);
        }
    }
}
PART II PHP RECEIVER
function clean () { /* ....stuff to make user input more secure, like rawurldecode, html_entity_decode, stripslashes, etc. */ }
$pcache = clean( $_REQUEST['pcache'] ?? '' );
$thismedia_id = clean( $_REQUEST['thismedia_id'] ?? '' );
$thismedia_name = clean( $_REQUEST['thismedia_name'] ?? '' );
$thismedia_ext = pathinfo($exif_file)['extension'] ?? '';
$numberofchunks = clean( $_REQUEST['numberofchunks'] ?? '' );
$thischunk_number = clean( $_REQUEST['thischunk_number'] ?? '' );
$thisfilex = clean( $_REQUEST['thisfilex'] ?? '' );
$howmanyfiles = clean( $_REQUEST['howmanyfiles'] ?? '' );
$thissession = clean( $_REQUEST['thissession'] ?? '' );
if ( $pcache != '' ) {
    // DEV
    // var_dump(['thismedia_id'=>$thismedia_id,'thismedia_name'=>$thismedia_name,'numberofchunks'=>$numberofchunks,'thischunk_number'=>$thischunk_number,'thisfilex'=>$thisfilex]);
    // WHERE TO SAVE CHUNKS
    $received_chunk_dir = '/temporary_path_here_you_receiver_chunks/'.$thissession.'/'.$thismedia_id.'/';
    $received_chunk_path = $received_chunk_dir.$thismedia_id.'_chunk_'.$thischunk_number;
    // IF DIRECTORY NOT THERE, CREATE IT
    if ( !file_exists($received_chunk_dir) ) { shell_exec('mkdir -p "'.$received_chunk_dir.'"'); }
    // MOVE_UPLOADED_FILE
    foreach ($_FILES as $thisfilekey => $thisfilechun) {
        if ( isset($thisfilechun['name']) && isset($thisfilechun['tmp_name']) ) {
            if ( filesize($thisfilechun['tmp_name']) > 1 ) { move_uploaded_file($thisfilechun['tmp_name'],$received_chunk_path); }
        }
    }
    // ECHO PERCENT PROGRESSION FOR THAT FILE
    echo '<script>$("#phpz_'.$thismedia_id.'_message").html("FILE '.$thisfilex.' : received chunk number '.$thischunk_number.' of '.$numberofchunks.' chunks");</script>';
}
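Since the question itself is Flask-based, here is a hedged sketch of the same receiver in Flask (the route, field names, and temp directory mirror the PHP above; CHUNK_DIR is illustrative):
import os
from flask import Flask, request

app = Flask(__name__)
CHUNK_DIR = '/tmp/upload_chunks'  # illustrative temporary location

@app.route('/slice_receiver', methods=['POST'])
def slice_receiver():
    # Mirror the PHP receiver: store each chunk under session/media id,
    # named with its chunk number, for the combiner to stitch together later.
    media_id = request.form['thismedia_id']
    session_id = request.form['thissession']
    chunk_no = request.form['thischunk_number']
    target_dir = os.path.join(CHUNK_DIR, session_id, media_id)
    os.makedirs(target_dir, exist_ok=True)
    request.files['data'].save(os.path.join(target_dir, '{}_chunk_{}'.format(media_id, chunk_no)))
    return 'received chunk {} of {}'.format(chunk_no, request.form['numberofchunks'])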
PART III PHP COMBINER
$pcache = clean( $_REQUEST['pcache'] ?? '' );
$thismedia_id = clean( $_REQUEST['thismedia_id'] ?? '' );
$thismedia_name = accentx(clean( $_REQUEST['thismedia_name'] ?? '' ),'unlock');
$thismedia_ext = exiftensionx(['exif_file'=>$thismedia_name]);
$numberofchunks = clean( $_REQUEST['numberofchunks'] ?? '' );
$thischunk_number = clean( $_REQUEST['thischunk_number'] ?? '' );
$thisfilex = clean( $_REQUEST['thisfilex'] ?? '' );
$howmanyfiles = clean( $_REQUEST['howmanyfiles'] ?? '' );
$thissession = clean( $_REQUEST['thissession'] ?? '' );
if ( $thissession != '' ) {
    // PATH
    $received_chunk_dir = '/temporary_path_here_you_receiver_chunks/'.$thissession;
    $received_final_path = $received_chunk_dir.'/'.$thismedia_id.'/'.$thismedia_id.'.'.$thismedia_ext;
    // GET SORTED LIST OF FILES (sort -V) because chunk_1, chunk_2, ...., chunk_9, chunk_10, chunk_11 ==> we don't want chunk_1,chunk_10,chunk_2 but chunk_1,chunk_2,...,chunk_10
    $all_chunks_raw = shell_exec('ls '.$received_chunk_dir.'/'.$thismedia_id.'/* | sort -V');
    if ( $all_chunks_raw != '' ) {
        // GET LS OF ALL CHUNKS
        $all_chunks_explo = array_filter(explode(PHP_EOL,$all_chunks_raw));
        // IF ONLY 1 CHUNK, JUST RENAME
        if ( count($all_chunks_explo) == 1 ) { rename($all_chunks_explo[0],$received_final_path); #unlink($all_chunks_explo[0]); }
        else {
            // RECOMBINE ALL CHUNKS WITH FOPEN/FREAD; chunksize = 1024 * 1024 * 9 = 9437184 from the JavaScript -- HAS TO BE THE SAME VALUE
            foreach ( $all_chunks_explo as $chunkey => $chunx ){
                $file = fopen($chunx, 'rb'); $buff = fread($file, 9437184); fclose($file);
                $final = fopen($received_final_path, 'ab'); $write = fwrite($final, $buff); fclose($final);
            }
            // DELETE CHUNKS AFTER COMBINE (find, not ls, supports -name/-delete)
            shell_exec('find '.$received_chunk_dir.'/'.$thismedia_id.' -name "*_chunk_*" -delete');
        }
    }
    // HERE YOU CAN FFMPEG, IMAGEMAGICK, ETC TO CONVERT TO A WEB-COMPATIBLE FORMAT
    // HERE YOU CAN SEND FILES TO AN S3 BUCKET (services like filebase.com)
    // DELETE THE FILE AFTER SENDING TO THE S3 BUCKET IF YOU NEED TO CLEAR SPACE
    echo '<script>$("#phpz_'.$thismedia_id.'_message").append(" ---COMBINATION DONE---");</script>';
}
******** NOTE: because of async, it's important to WAIT for all chunks BEFORE combining. Because of network (internet) factors, chunks don't always arrive one after another; sometimes you get chunk 1, then 3, then 6, then 2. That's why an AJAX call sends a signal to the combiner to "tell" it: OK, all chunks have been sent.
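Coming back to the original Flask question: if the goal is to stream the repost and observe progress server-side without hand-rolled chunking, one option is the requests-toolbelt package (assuming it is available in your environment; forward_upload is an illustrative name):
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder, MultipartEncoderMonitor

def forward_upload(f, url):
    # MultipartEncoder streams the multipart body instead of buffering the
    # whole file in memory; the monitor callback fires as bytes are sent.
    encoder = MultipartEncoder(fields={'file': (f.filename, f.stream, f.mimetype)})
    monitor = MultipartEncoderMonitor(
        encoder, lambda m: print('sent {} of {} bytes'.format(m.bytes_read, m.len)))
    return requests.post(url, data=monitor,
                         headers={'Content-Type': monitor.content_type})
Pushing that percentage on to the browser would still need a separate channel (polling, SSE, or websockets), as the question suspects.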

How to store Python output received in Node.js?

I'm invoking a Python script from Node.js. The Python script retrieves data from a REST API, stores it in a dataframe, and then a search runs based on user input. I'm confused as to what type Python sends the data to Node.js in. I've tried converting it into a string, but Node.js says it is an unresolved variable type. Here's the code:
import sys
import json
import requests
from pandas import json_normalize

r = requests.get(url)
data = r.json()
nested = json.loads(r.text)
nested_full = json_normalize(nested)
req_data = json_normalize(nested, record_path='items')
search = req_data.get(["name", "id"])
#search.head(10)
filter = sys.argv[1:]
print(filter)
input = filter[0]
print(input)
result = search[search["requestor_name"].str.contains(input)]
result = result.to_string(index=False)
response = '```' + str(result) + '```'
print(response)
sys.stdout.flush()
Here's the Node.js program that invokes the above Python script. How do I store the output in a format which I can pass to another function in Node?
var input = 'robert';
var childProcess = require("child_process").spawn('python', ['./search.py', input], {stdio: 'inherit'});
const stream = require('stream');
const format = require('string-format');
var result = '';

childProcess.on('data', function(data){
    process.stdout.write("python script output", data);
    result += String(data);
    console.log("Here it is", data);
});
childProcess.on('close', function(code) {
    if (code === 1) {
        process.stderr.write("error occurred", code);
        process.exit(1);
    }
    else {
        process.stdout.write('done');
    }
});
According to the docs, the 'data' event is emitted by the child's stdout stream, not by the child process itself (also note that with stdio: 'inherit' the child's output is not piped back to your process, so drop that option if you want to capture it):
childProcess.stdout.on('data', (data) => {
    console.log(`stdout: ${data}`);
});

How to receive data through websockets in python

I'm trying to retrieve data programmatically through websockets and am failing due to my limited knowledge around this. On visiting the site at https://www.tradingview.com/chart/?symbol=ASX:RIO I notice one of the websocket messages being sent out is ~m~60~m~{"m":"quote_fast_symbols","p":["qs_p089dyse9tcu","ASX:RIO"]}
My code is as follows:
from websocket import create_connection
import json

ws = create_connection("wss://data.tradingview.com/socket.io/websocket?from=chart%2Fg0l68xay%2F&date=2019_05_27-12_19")
ws.send(json.dumps({"m": "quote_fast_symbols", "p": ["qs_p089dyse9tcu", "ASX:RIO"]}))
result = ws.recv()
print(result)
ws.close()
Result of the print:
~m~302~m~{"session_id":"<0.25981.2547>_nyc2-charts-3-webchart-5#nyc2-compute-3_x","timestamp":1558976872,"release":"registry:5000/tvbs_release/webchart:release_201-106","studies_metadata_hash":"888cd442d24cef23a176f3b4584ebf48285fc1cd","protocol":"json","javastudies":"javastudies-3.44_955","auth_scheme_vsn":2}
I get this result no matter what message I send, out of the multitude of messages that seem to be sent. I was hoping one of the messages sent back would be the price info for the lows and highs for RIO. Are there other steps I should take to get this data? I understand there might be some form of authorisation needed, but I don't know the workflow.
Yes, there is much more to set up, and it needs to be done in order. The following example, written in Node.js, will subscribe to the BINANCE:BTCUSDT real-time data and fetch 5000 historical bars on the daily chart.
Ensure you have a proper value for the Origin field set in the headers section before connecting; otherwise your connection request will be rejected by the proxy. In the most common ws library there is no way to do this, so use faye-websocket instead:
const WebSocket = require('faye-websocket')
const ws = new WebSocket.Client('wss://data.tradingview.com/socket.io/websocket', [], {
headers: { 'Origin': 'https://data.tradingview.com' }
});
After connecting, you need to set up your data stream. I don't know if all of these commands need to be performed; this can probably be shrunk even more, but it works. Basically what you need to do is create new quote and chart sessions and, within those sessions, request a stream of data for the previously resolved symbol.
ws.on('open', () => {
    const quote_session = 'qs_' + getRandomToken()
    const chart_session = 'cs_' + getRandomToken()
    const symbol = 'BINANCE:BTCUSDT'
    const timeframe = '1D'
    const bars = 5000
    sendMsg(ws, "set_auth_token", ["unauthorized_user_token"])
    sendMsg(ws, "chart_create_session", [chart_session, ""])
    sendMsg(ws, "quote_create_session", [quote_session])
    sendMsg(ws, "quote_set_fields", [quote_session,"ch","chp","current_session","description","local_description","language","exchange","fractional","is_tradable","lp","lp_time","minmov","minmove2","original_name","pricescale","pro_name","short_name","type","update_mode","volume","currency_code","rchp","rtc"])
    sendMsg(ws, "quote_add_symbols",[quote_session, symbol, {"flags":['force_permission']}])
    sendMsg(ws, "quote_fast_symbols", [quote_session, symbol])
    sendMsg(ws, "resolve_symbol", [chart_session,"symbol_1","={\"symbol\":\""+symbol+"\",\"adjustment\":\"splits\",\"session\":\"extended\"}"])
    sendMsg(ws, "create_series", [chart_session, "s1", "s1", "symbol_1", timeframe, bars])
});
ws.on('message', (msg) => { console.log(`RX: ${msg.data}`) })
And finally, the implementation of the helper methods:
const getRandomToken = (stringLength = 12) => {
    const characters = 'abcdefghijklmnopqrstuvwxyz0123456789'
    const charactersLength = characters.length;
    let result = ''
    for ( var i = 0; i < stringLength; i++ ) {
        result += characters.charAt(Math.floor(Math.random() * charactersLength))
    }
    return result
}

const createMsg = (msg_name, paramsList) => {
    const msg_str = JSON.stringify({ m: msg_name, p: paramsList })
    return `~m~${msg_str.length}~m~${msg_str}`
}

const sendMsg = (ws, msg_name, paramsList) => {
    const msg = createMsg(msg_name, paramsList)
    console.log(`TX: ${msg}`)
    ws.send(msg)
}

Web scraping a page after it's loaded its data

Trying to collect data on book price fluctuations for a school project. I'm using Python to scrape from a book buyback aggregator (in this case, bookscouter), but I find that since the site has to load in the data, grabbing the source code through the urllib2 package gives me the source code from before the data is loaded. How do I pull from after the data is loaded?
Example: http://bookscouter.com/prices.php?isbn=9788498383621&searchbutton=Sell
You cannot do this with Python alone; you need a JavaScript engine API like PhantomJS.
With Phantom it is very easy to set up scraping of all the page contents, both static and dynamic JavaScript contents (like the results of Ajax calls, in your case). In fact you can register page event handlers to your page parser like this (this is a node.js + phantom.js example):
/*
 * Register Page Handlers as functions
 * {
 *     onLoadStarted : onLoadStarted,
 *     onLoadFinished: onLoadFinished,
 *     onError : onError,
 *     onResourceRequested : onResourceRequested,
 *     onResourceReceived : onResourceReceived,
 *     onNavigationRequested : onNavigationRequested,
 *     onResourceError : onResourceError
 * }
 */
registerHandlers : function(page, handlers) {
    if(handlers.onLoadStarted) page.set('onLoadStarted', handlers.onLoadStarted)
    if(handlers.onLoadFinished) page.set('onLoadFinished', handlers.onLoadFinished)
    if(handlers.onResourceError) page.set('onResourceError', handlers.onResourceError)
    if(handlers.onResourceRequested) page.set('onResourceRequested', handlers.onResourceRequested)
    if(handlers.onResourceReceived) page.set('onResourceReceived', handlers.onResourceReceived)
    if(handlers.onNavigationRequested) page.set('onNavigationRequested', handlers.onNavigationRequested)
    if(handlers.onError) page.set('onError', handlers.onError)
}
At this point you have full control of what is going on in the page, and of when it happens, with handlers like:
var onResourceError = function(resourceError) {
    var errorReason = resourceError.errorString;
    var errorPageUrl = resourceError.url;
}

var onResourceRequested = function (request) {
    var msg = ' request: ' + JSON.stringify(request, undefined, 4);
};

var onResourceReceived = function(response) {
    var msg = ' id: ' + response.id + ', stage: "' + response.stage + '", response: ' + JSON.stringify(response);
};

var onNavigationRequested = function(url, type, willNavigate, main) {
    var msg = ' destination_url: ' + url;
    msg += ' type (cause): ' + type;
    msg += ' will navigate: ' + willNavigate;
    msg += ' from page\'s main frame: ' + main;
};

page.onResourceRequested(
    function(requestData, request) {
        //request.abort()
        //request.changeUrl(url)
        //request.setHeader(key,value)
        var msg = ' request: ' + JSON.stringify(request, undefined, 4);
        //console.log( msg )
    },
    function(requestData) {
        //console.log(requestData.url)
    })
PageHelper.registerHandlers(page, {
    onLoadStarted : onLoadStarted,
    onLoadFinished: onLoadFinished,
    onError : null,             // onError THIS HANDLER CRASHES PHANTOM-NODE
    onResourceRequested : null, // MUST BE ON PAGE OBJECT
    onResourceReceived : onResourceReceived,
    onNavigationRequested : onNavigationRequested,
    onResourceError : onResourceError
});
As you can see, you can define your page handlers and take control of the flow, and thus of the resources loaded on that page. So you can be sure that all data is ready and set before you take the whole page source, like:
var Parser = {
    parse : function(page) {
        var onSuccess = function (page) { // page loaded
            var pageContents = page.evaluate(function() {
                return document.body.innerText;
            });
        }
        var onError = function (page, elapsed) { // error
        }
        page.evaluate(function(func) {
            return func(document);
        }, function(dom) {
            return true;
        });
    }
} // Parser
Here you can see the whole page contents loaded in the onSuccess callback:
var pageContents = page.evaluate(function() {
    return document.body.innerText;
});
The page comes from PhantomJS directly, as in the following snippet:
phantom.create(function (ph) {
    ph.createPage(function (page) {
        Parser.parse(page)
    })
}, options)
Of course this is just to give you an idea of what you can do with node.js + PhantomJS, which are super powerful when combined.
You can run PhantomJS from a Python environment, calling it like:
try:
    output = ''
    for result in runProcess([self.runProcess,
                              self.runScript,
                              self.jobId,
                              self.protocol,
                              self.hostname,
                              self.queryString]):
        output += '' + result
    print(output)
except Exception as e:
    print(e)
    print(traceback.format_exc())
where you use subprocess.Popen to execute the binary:
def runProcess(exe):
    p = subprocess.Popen(exe, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    while True:
        retcode = p.poll()  # returns None while the subprocess is running
        line = p.stdout.readline()
        yield line
        if retcode is not None:
            break
Of course, the process to run is node.js in this case:
self.runProcess = 'node'
with the args you need as params.
The challenge is reading the data once it's been rendered by a web browser, which requires some extra tricks. First, check whether the site has a pre-rendered version* or an API.
This article (linked from the Web Archive) has a pretty good breakdown of what you'll need to do. It can be summed up as:
Pick a good Python-WebKit renderer (in the article's case, PyQt)
Use a windowing widget to fetch and render the page
Fetch the rendered HTML from the widget
Parse this HTML as normal using a library like lxml or BeautifulSoup
(A sketch of the render step follows the footnote below.)
* Minor rant - the idea of having to hope for a pre-rendered version of what should be a static webpage angers me.
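For steps 2 and 3 of that list, the classic sketch from that era uses PyQt4's QtWebKit bindings (assuming PyQt4 is installed; the approach is dated, and the Render class name is illustrative):
import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage

class Render(QWebPage):
    # Load a URL, let WebKit execute its JavaScript, then capture the DOM.
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()  # blocks until loadFinished fires

    def _load_finished(self, result):
        self.html = self.mainFrame().toHtml()  # rendered HTML, after JavaScript ran
        self.app.quit()

# html = Render('http://bookscouter.com/prices.php?isbn=9788498383621&searchbutton=Sell').html
# ...then parse html with lxml or BeautifulSoup as usual.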
