Web scraping a page after it's loaded its data

Web scraping a page after it's loaded its data - python

Trying to collect data on book price fluctuations for a school project. I'm using Python to scrape from a book buyback aggregator (in this case, bookscouter), but I find that since the site has to load in the data, grabbing the source code through the urllib2 package gives me the source code from before the data is loaded. How do I pull from after the data is loaded?
Example: http://bookscouter.com/prices.php?isbn=9788498383621&searchbutton=Sell

You cannot this with Python only. You need a JavaScript engine API like PhantomJS
With Phantom, will be very easy to setup the web scraping of all the page contents, static and dynamic JavaScript contents (like Ajax calls results in your case). Infact you can register page event handlers to your page parser like (this is a node.js + phantom.js example)
/*
* Register Page Handlers as functions
{
onLoadStarted : onLoadStarted,
onLoadFinished: onLoadFinished,
onError : onError,
onResourceRequested : onResourceRequested,
onResourceReceived : onResourceReceived,
onNavigationRequested : onNavigationRequested,
onResourceError : onResourceError
}
*/
registerHandlers : function(page, handlers) {
if(handlers.onLoadStarted) page.set('onLoadStarted',handlers.onLoadStarted)
if(handlers.onLoadFinished) page.set('onLoadFinished',handlers.onLoadFinished)
if(handlers.resourceError) page.set('onResourceError', handlers.resourceError)
if(handlers.onResourceRequested) page.set('onResourceRequested',handlers.onResourceRequested)
if(handlers.onResourceReceived) page.set('onResourceReceived',handlers.onResourceReceived)
if(handlers.onNavigationRequested) page.set('onNavigationRequested',handlers.onNavigationRequested)
if(handlers.onError) page.set('onError',handlers.onError)
}
At this point you have full control of what is going on and when in the page you have to download like:
var onResourceError = function(resourceError) {
var errorReason = resourceError.errorString;
var errorPageUrl = resourceError.url;
}
var onResourceRequested = function (request) {
var msg = ' request: ' + JSON.stringify(request, undefined, 4);
};
var onResourceReceived = function(response) {
var msg = ' id: ' + response.id + ', stage: "' + response.stage + '", response: ' + JSON.stringify(response);
};
var onNavigationRequested = function(url, type, willNavigate, main) {
var msg = ' destination_url: ' + url;
msg += ' type (cause): ' + type;
msg += ' will navigate: ' + willNavigate;
msg += ' from page\'s main frame: ' + main;
};
page.onResourceRequested(
function(requestData, request) {
//request.abort()
//request.changeUrl(url)
//request.setHeader(key,value)
var msg = ' request: ' + JSON.stringify(request, undefined, 4);
//console.log( msg )
},
function(requestData) {
//console.log(requestData.url)
})
PageHelper.registerHandlers(page,
{
onLoadStarted : onLoadStarted,
onLoadFinished: onLoadFinished,
onError : null, // onError THIS HANDLER CRASHES PHANTOM-NODE
onResourceRequested : null, // MUST BE ON PAGE OBJECT
onResourceReceived : onResourceReceived,
onNavigationRequested : onNavigationRequested,
onResourceError : onResourceError
});
As you can see you can define you page handlers and take control of the flow and so of the resources loaded on that page. So you can be sure that all data are ready and set, before you take the whole page source like:
var Parser = {
parse : function(page) {
var onSuccess = function (page) { // page loaded
var pageContents=page.evaluate(function() {
return document.body.innerText;
});
}
var onError = function (page,elapsed) { // error
}
page.evaluate(function(func) {
return func(document);
}, function(dom) {
return true;
});
}
} // Parser
Here you can see the whole page contents loaded in the onSuccess callback:
var pageContents=page.evaluate(function() {
return document.body.innerText;
});
The page comes from Phantomjs directly like in the following snippet:
phantom.create(function (ph) {
ph.createPage(function (page) {
Parser.parse(page)
})
},options)
Of course this to give you and idea of what you can do with node.js + Phantomjs, that are super powerful when combined together.
You can run phantomjs in a Python env, calling it like
try:
output = ''
for result in runProcess([self.runProcess,
self.runScript,
self.jobId,
self.protocol,
self.hostname,
self.queryString]):
output += '' + result
print output
except Exception as e:
print e
print(traceback.format_exc())
where you use subprocess Popen to execute the binary:
def runProcess(exe):
p = subprocess.Popen(exe, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while(True):
retcode = p.poll() #returns None while subprocess is running
line = p.stdout.readline()
yield line
if(retcode is not None):
break
Of course the process to run is node.js in this case
self.runProcess='node'
with the args you need as params.

The challenge is reading the data once its been rendered by a web browser, which will require some extra tricks to do. If you can see if the site has a pre-rendered version* or an API.
This article (linked from the Web archive) has a pretty good breakdown of what you'll need to do. It can be summed up however as:
Pick a good python-webkit renderer (in the case of the article PyQT)
Use a windowing widget to fetch and render the page
Fetch the rendered HTML from the widget
Parse this HTML as normal using a library like lXML or BeautifulSoup.
* Minor rant - the idea of having to hope for a pre-rendered version ofwhat should be a static webpage angers me.

Related

Error 308 Permanent Redirect in X++/C# code

I need to send an API request to SMS API.
My Python code is working but my X++/C# code is not working.
I tried with Postman, it is working as well.
Here's my Python code:
import requests
headers = {"Accept":"application/json"}
data = {"AppSid":"#############YxLtXtaN###",
"SenderID":"####23423#####",
"Body":"This is a test message.",
"Recipient":"###45645######",
"responseType":"JSON",
"CorrelationID":"",
"baseEncode":"true",
"statusCallback":"sent",
"async":"false"}
r = requests.post('http://myapi/rest/SMS/messages', auth=('user#domain.com', 'password'), headers=headers,
data=data)
Here's my X++/C# code:
class Class1
{
public static void main(Args _args)
{
str destinationUrl = 'myapi', requestXml, responseXml;
System.Net.HttpWebRequest request;
System.Net.HttpWebResponse response;
CLRObject clrObj;
System.Byte[] bytes;
System.Text.Encoding utf8;
System.IO.Stream requestStream, responseStream;
System.IO.StreamReader streamReader;
System.Exception ex;
System.Net.WebHeaderCollection httpHeader;
str byteStr;
System.Byte[] byteArray;
System.IO.Stream stream;
System.IO.Stream dataStream;
byteStr = strfmt('%1:%2', "user#domain.com", "password");
requestXml = " {\"AppSid\":\"###########\", \"SenderID\":\"########-AD\", \"Body\":\"This is a test message from ## from X++ Coding Language..\", \"Recipient\":\"######\", \"responseType\":\"JSON\", \"CorrelationID\":\"\", \"baseEncode\":\"true\", \"statusCallback\":\"sent\", \"async\":\"false\"}";
try
{
new InteropPermission(InteropKind::ClrInterop).assert();
httpHeader = new System.Net.WebHeaderCollection();
clrObj = System.Net.WebRequest::Create(destinationUrl);
request = clrObj;
utf8 = System.Text.Encoding::get_UTF8();
bytes = utf8.GetBytes(requestXml);
request.set_KeepAlive(true);
request.set_ContentType("application/xml");
request.AllowAutoRedirect=true;
utf8 = System.Text.Encoding::get_UTF8();
byteArray = utf8.GetBytes(byteStr);
byteStr = System.Convert::ToBase64String(byteArray);
httpHeader.Add("Authorization", 'Basic ' + byteStr);
request.set_ContentType("text/xml; encoding='utf-8'");
request.set_ContentLength(bytes.get_Length());
request.set_Method("POST");
request.set_Headers(httpHeader);
requestStream = request.GetRequestStream();
requestStream.Write(bytes, 0, bytes.get_Length());
response = request.GetResponse();
responseStream = response.GetResponseStream();
streamReader = new System.IO.StreamReader(responseStream);
responseXml = streamReader.ReadToEnd();
info(responseXml);
}
catch (Exception::CLRError)
{
//bp deviation documented
ex = CLRInterop::getLastException().GetBaseException();
error(ex.get_Message());
}
requestStream.Close();
streamReader.Close();
responseStream.Close();
response.Close();
}
}
I'm getting this error:
error code : The remote server returned an error: (308) Permanent
Redirect.

I made two changes.
removed the Authorization by commenting it out
//httpHeader.Add("Authorization", 'Basic ' + byteStr);
Made it HTTPS instead of HTTP. I could see the Location: HTTPS in the exception Response object in Visual Studio.
These 2 things resolved the issue for me.

I don't know about x++, but you could try explicitly allowing redirect on your webRequest object.
webRequest = System.Net.WebRequest::Create('http://myapi/rest/SMS/messages') as System.Net.HttpWebRequest;
webRequest.AllowAutoRedirect = true;
webRequest.MaximumAutomaticRedirections = 1; //Set value according to your requirements

Socket.io client won't display to all users

document.addEventListener('DOMContentLoaded', () => {
var socket = io.connect(location.protocol + '//' + document.domain + ':' + location.port);
socket.on('message', data => {
const p = document.createElement('p');
const span_username = document.createElement('span');
const span_timestamp = document.createElement('span');
const br = document.createElement('br');
span_username.innerHTML = data.username;
span_timestamp.innerHTML = data.time_stamp;
p.innerHTML = span_username.outerHTML + br.outerHTML + data.msg
+ br.outerHTML + span_timestamp.outerHTML;
document.querySelector('#display-message-section').append(p);
});
// Send message
document.querySelector("#send_message").onclick = () => {
socket.send({'msg':document.querySelector('#user_message').value,
'username': username});
}
});
Here is my full client side. I want to get it to show all sent messages to all users, including myself. When I opened another tab and then typed something, the first type won't show what my second tab sent. It is sent in the server but not displayed in the browser.
#socketio.on('message')
def message(data):
print(f"\n\n{data}\n\n")
send({'msg': data['msg'], 'username': data['username'], 'time_stamp':strftime('%b-%d %I:%M%p', localtime())})
This is my minimal server.

Try adding broadcast=True to the arguments on your send. By default, socketio will only emit to the original caller of the socket function.

How to store python output recieved in node js?

I'm invoking a python script from node js. The python script retrieves data from a REST API and stores it in a dataframe and then there's a search function based on user input. I'm confused as to what variable type does python send the data to node js in? I've tried to convert into a string but in node js it says it is an unresolved variable type. Here's the code:
r = requests.get(url)
data = r.json()
nested = json.loads(r.text)
nested_full = json_normalize(nested)
req_data= json_normalize(nested,record_path ='items')
search = req_data.get(["name", "id"," ])
#search.head(10)
filter = sys.argv[1:]
print(filter)
input = filter[0]
print(input)
result = search[search["requestor_name"].str.contains(input)]
result = result.to_String(index=false)
response = '```' + str(result) + '```'
print(response)
sys.stdout.flush()
Here's the node js program that invokes the above python script. How do i store the output in a format which i can pass to another function in node?
var input = 'robert';
var childProcess = require("child_process").spawn('python', ['./search.py', input], {stdio: 'inherit'})
const stream = require('stream');
const format = require('string-format')
childProcess.on('data', function(data){
process.stdout.write("python script output",data)
result += String(data);
console.log("Here it is", data);
});
childProcess.on('close', function(code) {
if ( code === 1 ){
process.stderr.write("error occured",code);
process.exit(1);
}
else{
process.stdout.write('done');
}
});

According to the docs:
childProcess.stdout.on('data', (data) => {
console.log(`stdout: ${data}`);
});

Getting 404 Not Found nginx 1.6.2 on Paytm

I have followed the usage as per link below:
https://www.npmjs.com/package/react-native-paytm
This is my checksum generation code:
import Checksum
# initialize a dictionary
paytmParams = dict()
# put checksum parameters in Dictionary
paytmParams["MID"] = "*****************"
paytmParams["ORDER_ID"] = 'ORD001'
paytmParams["CUST_ID"] = 'CUST001'
paytmParams["INDUSTRY_TYPE_ID"] = 'Retail'
paytmParams["CHANNEL_ID"] = 'WAP'
paytmParams["TXN_AMOUNT"] = '1.00'
paytmParams["WEBSITE"] = 'WEBSTAGING'
paytmParams["EMAIL"] = '**************'
paytmParams["MOBILE_NO"] = '****************'
paytmParams["CALLBACK_URL"] = 'https://securegw-
stage.paytm.in/theia/paytmCallback?ORDER_ID=ORD001'
# Find your Merchant Key in your Paytm Dashboard at
https://dashboard.paytm.com/next/apikeys
checksum = Checksum.generate_checksum(paytmParams, "*************")
Code for react native:
import paytm from 'react-native-paytm';
import { Platform, DeviceEventEmitter, NativeModules,NativeEventEmitter} from 'react-native';
const paytmConfig = {
MID: '************',
WEBSITE: 'WEBSTAGING',
CHANNEL_ID: 'WAP',
INDUSTRY_TYPE_ID: 'Retail',
CALLBACK_URL: 'https://securegw-
stage.paytm.in/theia/paytmCallback?ORDER_ID=ORD001'
}
onPayTmResponse(response) {
// Process Response
// response.response in case of iOS
// reponse in case of Android
console.log(response);
}
runTransaction() {
const callbackUrl = 'https://securegw-
stage.paytm.in/theia/paytmCallback?ORDER_ID=ORD001'
const details = {
mode: 'Staging', // 'Staging' or 'Production'
mid: paytmConfig.MID,
industryType: paytmConfig.INDUSTRY_TYPE_ID,
website: paytmConfig.WEBSITE,
channel: paytmConfig.CHANNEL_ID,
amount: '1.00', // String
orderId: 'ORD001', // String
custId: 'CUST001', // String
email: '*****************', // String
phone: '***********', // S
checksumhash: '***********************************************', //From your server using PayTM Checksum Utility
callback: callbackUrl
};
paytm.startPayment(details);
}
The issue is that i cant even spot the error point here as there is no console. I could use some direction here. Need this for staging right now

in my cash use this npm #philly25/react-native-paytm and solved this error
are you sure onPaytmResponse is a paytm listener
like this:
componentWillMount() {
Paytm.addListener(Paytm.Events.PAYTM_RESPONSE, this.onPayTmResponse)
};
componentWillUnmount() {
Paytm.removeListener(Paytm.Events.PAYTM_RESPONSE, this.onPayTmResponse);
};

How to receive data through websockets in python

I'm trying to retrieve data programmatically through websockets and am failing due to my limited knowledge around this. On visiting the site at https://www.tradingview.com/chart/?symbol=ASX:RIO I notice one of the websocket messages being sent out is ~m~60~m~{"m":"quote_fast_symbols","p":["qs_p089dyse9tcu","ASX:RIO"]}
My code is as follows:
from websocket import create_connection
import json
ws = create_connection("wss://data.tradingview.com/socket.io/websocket?from=chart%2Fg0l68xay%2F&date=2019_05_27-12_19")
ws.send(json.dumps({"m":"quote_fast_symbols","p"["qs_p089dyse9tcu","ASX:RIO"]}))
result = ws.recv()
print(result)
ws.close()
Result of the print:
~m~302~m~{"session_id":"<0.25981.2547>_nyc2-charts-3-webchart-5#nyc2-compute-3_x","timestamp":1558976872,"release":"registry:5000/tvbs_release/webchart:release_201-106","studies_metadata_hash":"888cd442d24cef23a176f3b4584ebf48285fc1cd","protocol":"json","javastudies":"javastudies-3.44_955","auth_scheme_vsn":2}
I get this result no matter what message I send out, out of the almost multitude of messages that seem to be sent out. I was hoping one of the messages sent back will be the prices info for the low and highs for RIO. Is there other steps I should include to get this data? I understand there might be some form of authorisation needed but I dont know the workflow.

Yes, there is much more to setup and it needs to be done in order. The following example written in Node.js will subscribe to the BINANCE:BTCUSDT real time data and fetch historical 5000 bars on the daily chart.
Ensure you have proper value of the origin field set in header section before connecting. Otherwise your connection request will be rejected by the proxy. I most common ws there is no way to do this. Use faye-websocket instead
const WebSocket = require('faye-websocket')
const ws = new WebSocket.Client('wss://data.tradingview.com/socket.io/websocket', [], {
headers: { 'Origin': 'https://data.tradingview.com' }
});
After connecting you need to setup your data stream. I don't know if all of this commands needs to be performed. This probably can be shrink even more but it works. Basically what you need to do is to create new quote and chart sessions and within these sessions request stream of the data of the prior resolved symbol.
ws.on('open', () => {
const quote_session = 'qs_' + getRandomToken()
const chart_session = 'cs_' + getRandomToken()
const symbol = 'BINANCE:BTCUSDT'
const timeframe = '1D'
const bars = 5000
sendMsg(ws, "set_auth_token", ["unauthorized_user_token"])
sendMsg(ws, "chart_create_session", [chart_session, ""])
sendMsg(ws, "quote_create_session", [quote_session])
sendMsg(ws, "quote_set_fields", [quote_session,"ch","chp","current_session","description","local_description","language","exchange","fractional","is_tradable","lp","lp_time","minmov","minmove2","original_name","pricescale","pro_name","short_name","type","update_mode","volume","currency_code","rchp","rtc"])
sendMsg(ws, "quote_add_symbols",[quote_session, symbol, {"flags":['force_permission']}])
sendMsg(ws, "quote_fast_symbols", [quote_session, symbol])
sendMsg(ws, "resolve_symbol", [chart_session,"symbol_1","={\"symbol\":\""+symbol+"\",\"adjustment\":\"splits\",\"session\":\"extended\"}"])
sendMsg(ws, "create_series", [chart_session, "s1", "s1", "symbol_1", timeframe, bars])
});
ws.on('message', (msg) => { console.log(`RX: ${msg.data}`) })
And finally implementation of the helper methods
const getRandomToken = (stringLength=12) => {
characters = 'abcdefghijklmnopqrstuvwxyz0123456789'
const charactersLength = characters.length;
let result = ''
for ( var i = 0; i < stringLength; i++ ) {
result += characters.charAt(Math.floor(Math.random() * charactersLength))
}
return result
}
const createMsg = (msg_name, paramsList) => {
const msg_str = JSON.stringify({ m: msg_name, p: paramsList })
return `~m~${msg_str.length}~m~${msg_str}`
}
const sendMsg = (ws, msg_name, paramsList) => {
const msg = createMsg(msg_name, paramsList)
console.log(`TX: ${msg}`)
ws.send(createMsg(msg_name, paramsList))
}

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Web scraping a page after it's loaded its data - python

Related

Error 308 Permanent Redirect in X++/C# code

Socket.io client won't display to all users

How to store python output recieved in node js?

Getting 404 Not Found nginx 1.6.2 on Paytm

How to receive data through websockets in python

Categories

Resources