How to change Json data output in table format - python

import requests
from pprint import pprint
import pandas as pd
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
'like Gecko) '
'Chrome/80.0.3987.149 Safari/537.36',
'accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
print(res.json())
I tried df = pd.DataFrame(res.json()) but couldn't get the data in table format. How can I do that? Also, how can I select only a few particular columns in the output instead of all columns?

Try this :
import json
import codecs
# Re-encode the response text and decode it as 'utf-8-sig' so that a leading
# byte-order mark, if present, is stripped before the JSON is parsed.
decoded_text = res.text.encode('utf-8').decode('utf-8-sig')
parsed = json.loads(decoded_text)
# The table rows are taken from the top-level 'data' key of the document.
df = pd.DataFrame(parsed['data'])
And to select specific columns, you can use:
mini_df = df[['symbol', 'latestOI', 'prevOI', 'changeInOI', 'avgInOI']]
>>> print(mini_df)

Related

Parsing a table with Pandas

I am trying to parse the table from https://alreits.com/screener
I have tried this:
main_url = 'https://alreits.com/screener'
r = requests.get(main_url)
df_list = pd.read_html(r.text)
df = df_list[0]
print(df)
but pandas can't find the table.
I have also tried using BeautifulSoup4, but it didn't seem to give better results.
This is the selector: #__next > div.MuiContainer-root.MuiContainer-maxWidthLg > div.MuiBox-root.jss9.Card__CardContainer-feksr6-0.fpbzHQ.ScreenerTable__CardContainer-sc-1c5wxgl-0.GRrTj > div > table > tbody
This is the full xPath: /html/body/div/div[2]/div[2]/div/table/tbody
I am trying to get the Stock symbol (under name),sector,score and market cap. The other data would be nice to have but is not necessary.
Thank You!
I found a JSON URL using the browser dev tools. This is an easy way to extract the table instead of using selenium. Use a POST request to extract the data.
import requests
headers = {
'authority': 'api.alreits.com:8080',
'sec-ch-ua': '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
'sec-ch-ua-platform': '"Windows"',
'content-type': 'application/json',
'accept': '*/*',
'origin': 'https://alreits.com',
'sec-fetch-site': 'same-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://alreits.com/',
'accept-language': 'en-US,en;q=0.9',
}
params = (
('page', '0'),
('size', '500'),
('sort', ['marketCap,desc', 'score,desc', 'ffoGrowth,desc']),
)
data = '{"filters":[]}'
response = requests.post('https://api.alreits.com:8080/api/reits/screener', headers=headers, params=params, data=data)
df = pd.DataFrame(response.json())
The code below will return the data you are looking for.
import requests
import pprint
import json
headers = {'content-type': 'application/json',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
r = requests.post(
'https://api.alreits.com:8080/api/reits/screener?page=0&size=500&sort=marketCap,desc&sort=score,desc&sort=ffoGrowth,desc',
headers=headers, data=json.dumps({'filters':[]}))
if r.status_code == 200:
pprint.pprint(r.json())
# Now you have the data - do what you want with it
else:
print(r.status_code)

Webscraping website - can't print price - api & json i think

I'm having trouble printing the price from this website. I think I'm close, but I'm getting the error below.
Please help, thanks.
"
{'statusDetails': {'state': 'FAILURE', 'errorCode': 'SYS-3003', 'correlationid': 'rrt-5636881267628447407-b-gsy1-18837-18822238-1', 'description': 'Invalid key identifier or token'}}
"
code:
import requests
import json
s = requests.Session()
url = 'https://www.bunnings.com.au/ozito-pxc-2-x-18v-cordless-line-trimmer-skin-only_p0167719'
header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
resp = s.get(url,headers=header)
api_url = f'https://api.prod.bunnings.com.au/v1/products/0167719/fulfillment/6400/radius/100000?isToggled=true'
price_resp = s.get(api_url,headers=header).json()
print(price_resp)
#price = price_resp['data']['price']['value']
#print(price)

python handling incoming from url

I'm sending the request below to a URL and getting a response from it.
import requests
url = "http://localhost/dat.txt"
payload = {}
headers = {
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
'Sec-Fetch-Dest': 'document',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
response = requests.request("GET", url, headers=headers, data = payload)
print(response.text.encode('utf8'))
Below is the response data that I get -
mohame4|nameon#example.com|passsd!##$4|head,customer|manager,devlop
mohame3|nameon3#example.com|passsd!##$4|head,customer|manager,devlop
I do this with the data
for i in response.text:
try:
i = i.strip().split('|')
userna = i[0]
emaill = i[1]
passd = i[2]
rol1= i[3]
rol2= i[4]
except:
pass
How can I turn rol1 from the string
head,customer
into the list
rol1=['head','customer']
Simply split the string you're getting:
rol1 = i[3].split(',')
You could do this more... gracefully, though, using iterable unpacking:
username, email, password, rol1, rol2 = i.strip().split('|')
rol1 = rol1.split(',')
Thanks to everyone who helped, especially @ForceBru.
import requests

# Fetch the pipe-delimited user file and print the comma-separated values of
# the fourth field of every record as a list.
url = "http://localhost/dat.txt"
response = requests.request("GET", url)
print(response.text)

# splitlines() copes with any newline style and, unlike split('\n'), does not
# yield an empty trailing element when the file ends with a newline.
dat = response.text.splitlines()
for line in dat:
    fields = line.strip().split('|')
    if len(fields) < 4:
        # Blank or truncated line: there is no fourth field to read, so skip
        # it instead of failing with IndexError.
        continue
    print(fields[3].split(","))
    # TODO: write code...

Scraping with requests

What is wrong with my code? I am trying to get the same content as shown at https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG but the result is different from what I want.
import requests
from bs4 import BeautifulSoup
# Fetch the timetable page through a session that sends a browser-like
# User-Agent, then pretty-print the parsed HTML.
s = requests.Session()
# Long literals are split into adjacent string literals, which Python joins
# at compile time; a quoted string cannot simply continue on the next line.
s.headers.update({
    "User-Agent": (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    ),
})
response = s.get(
    'https://koleo.pl/rozklad-pkp/krakow-glowny/radom/'
    '19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG'
)
soup = BeautifulSoup(response.text, 'lxml')
print(soup.prettify())
You can use requests and pass params to get JSON for the train info and prices. I haven't parsed out all the info, as this is just to show you it is possible. I parse out the train ids in order to make the subsequent requests for price info, which is linked to the train info by id.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://koleo.pl/pl/connections/?'
headers = {
'Accept' : 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding' : 'gzip, deflate, br',
'Accept-Language' : 'en-US,en;q=0.9',
'Connection' : 'keep-alive',
'Cookie' : '_ga=GA1.2.2048035736.1553000429; _gid=GA1.2.600745193.1553000429; _gat=1; _koleo_session=bkN4dWRrZGx0UnkyZ3hjMWpFNGhiS1I3TzhQMGNyWitvZlZ0QVRUVVVtWUFPMUwxL0hJYWJyYnlGTUdHYXNuL1N6QlhHMHlRZFM3eFZFcjRuK3ZubllmMjdSaU5CMWRBSTFOc1JRc2lDUGV0Y2NtTjRzbzZEd0laZWI1bjJoK1UrYnc5NWNzZzNJdXVtUlpnVE15QnRnPT0tLTc1YzV1Q2xoRHF4VFpWWTdWZDJXUnc9PQ%3D%3D--3b5fe9bb7b0ce5960bc5bd6a00bf405df87f8bd4',
'Host' : 'koleo.pl',
'Referer' : 'https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG',
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
'X-CSRF-Token' : 'heag3Y5/fh0hyOfgdmSGJBmdJR3Perle2vJI0VjB81KClATLsJxFAO4SO9bY6Ag8h6IkpFieW1mtZbD4mga7ZQ==',
'X-Requested-With' : 'XMLHttpRequest'
}
# Query-string parameters for the connections search request.
params = {
    'v': 'a0dec240d8d016fbfca9b552898aba9c38fc19d5',
    'query[date]': '19-03-2019 10:00:00',
    'query[start_station]': 'krakow-glowny',
    'query[end_station]': 'radom',
    # A dict literal keeps only the last value of a repeated key, so writing
    # 'query[brand_ids][]' twice silently dropped '29'. Passing both ids as a
    # list makes requests emit one key=value pair per item.
    'query[brand_ids][]': ['29', '28'],
    'query[only_direct]': 'false',
    'query[only_purchasable]': 'false',
}
with requests.Session() as s:
data= s.get(url, params = params, headers = headers).json()
print(data)
priceUrl = 'https://koleo.pl/pl/prices/{}?v=a0dec240d8d016fbfca9b552898aba9c38fc19d5'
for item in data['connections']:
r = s.get(priceUrl.format(item['id'])).json()
print(r)
You have to use selenium in order to get that dynamically generated content. And then you can parse html with BS. For example I've parsed dates:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
driver.get('https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG')
soup = BeautifulSoup(driver.page_source, 'lxml')
for div in soup.findAll("div", {"class": 'date custom-panel'}):
date = div.findAll("div", {"class": 'row'})[0].string.strip()
print(date)
Output:
wtorek, 19 marca
środa, 20 marca

Reading data from a website passing parameters

import requests
from lxml import html
from bs4 import BeautifulSoup
session_requests = requests.session()
sw_url = "https://www.southwest.com"
sw_url2 = "https://www.southwest.com/flight/select-flight.html?displayOnly=&int=HOMEQBOMAIR"
#result = session_requests.get(sw_url)
#tree = html.fromstring(result.text)
payload = {"name":"AirFormModel","origin":"MCI","destination":"DAL","departDate":"2018-02-28T06:00:00.000Z","returnDate":"2018-03-03T06:00:00.000Z","tripType":"true","priceType":"DOLLARS","adult":1,"senior":0,"promoCode":""}
#{
# 'origin': 'MCI',
# 'destination': 'DAL',
# 'departDate':'2018-02-28T06:00:00.000Z',
# 'returnDate':'2018-03-01T06:00:00.000Z',
# 'adult':'1'
#}
p = requests.post(sw_url,params=payload)
#print(p.text)
print(p.content)
p1 = requests.get(sw_url2)
soup = BeautifulSoup(p.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
print(tag)
print('++++')
print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
soup = BeautifulSoup(p1.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
print(tag)
print('++++')
print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
I need to retrieve prices for flights between 2 locations on specific dates. I identified the parameters by looking at the session info from inspector of the browser and included them in the post request.
I am not sure what I'm doing wrong here, but I am unable to read the data from the tags correctly. It's printing None.
Edit : 4/25/2018
I'm using the following code now, but it doesn't seem to help. Please advise.
import threading
from lxml import html
from bs4 import BeautifulSoup
import time
import datetime
import requests
def worker(oa, da, ods):
    """Thread worker: post one one-way fare search and print the raw response.

    Args:
        oa: Origin airport code, sent as ``originAirport``.
        da: Destination airport code, sent as ``destinationAirport``.
        ods: Outbound date string, sent as ``outboundDateString``.

    Returns:
        None. The response object and its body are printed, bracketed by two
        timestamped lines identifying the route.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    print(f"{oa} {da} {ods} {datetime.datetime.now()}")
    url = "https://www.southwest.com/api/air-booking/v1/air-booking/page/air/booking/shopping"
    # No 'content-length' entry: requests derives it from the body actually
    # sent. The previous hard-coded '454' announced a body that was never
    # attached to the request.
    rh = {
        'accept': 'application/json,text/javascript,*/*;q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.5',
        'cache-control': 'max-age=0',
        'content-type': 'application/json',
        'referer': 'https://www.southwest.com/air/booking/select.html?originationAirportCode=MCI&destinationAirportCode=LAS&returnAirportCode=&departureDate=2018-05-29&departureTimeOfDay=ALL_DAY&returnDate=&returnTimeOfDay=ALL_DAY&adultPassengersCount=1&seniorPassengersCount=0&fareType=USD&passengerType=ADULT&tripType=oneway&promoCode=&reset=true&redirectToVision=true&int=HOMEQBOMAIR&leapfrogRequest=true',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    fd = {
        'returnAirport': '',
        'twoWayTrip': 'false',
        'fareType': 'DOLLARS',
        'originAirport': oa,
        'destinationAirport': da,
        'outboundDateString': ods,
        'returnDateString': '',
        'adultPassengerCount': '1',
        'seniorPassengerCount': '0',
        'promoCode': '',
        'submitButton': 'true',
    }
    with requests.Session() as s:
        # The search criteria were built but never sent; pass them as the
        # JSON body to match the declared content-type.
        # NOTE(review): confirm the field names and body format this
        # endpoint actually expects.
        # The timeout stops a stalled request from blocking the thread
        # indefinitely.
        r = s.post(url, headers=rh, json=fd, timeout=30)
        print(r)
        print(r.content)
    print(f"{oa} {da} {ods} {datetime.datetime.now()}")
#db = MySQLdb.connect(host="localhost",user="root",passwd="vikram",db="garmin")
rcount = 0
tdelta = 55
#print(strt_date)
threads = []
count = 1
thr_max = 2
r = ["MCI","DEN","MCI","MDW","MCI","DAL"]
strt_date = (datetime.date.today() + datetime.timedelta(days=tdelta)).strftime("%m/%d/%Y")
while count < 2:
t = threading.Thread(name=r[count-1]+r[count],target=worker,args=(r[count-1],r[count],strt_date))
threads.append(t)
t.start()
count = count + 2
When you say looked at the session info from inspector of the browser, I'm assuming you meant the network tab. If that's the case, are you sure you noted the data being sent properly?
Here's the URL that gets sent by the browser, following which the page you required is fetched:
url = 'https://www.southwest.com/flight/search-flight.html'
You didn't use headers in your request, which, in my opinion, should be passed compulsorily in some cases. Here are the headers that the browser passes:
:authority:www.southwest.com
:method:POST
:path:/flight/search-flight.html
:scheme:https
accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding:gzip, deflate, br
accept-language:en-US,en;q=0.9
cache-control:max-age=0
content-length:564
content-type:application/x-www-form-urlencoded
origin:https://www.southwest.com
referer:https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR
upgrade-insecure-requests:1
user-agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36
Note:
I removed the cookie header, because that would be taken care of by requests if you're using session.
The first four headers (those that begin with a colon (':')) cannot be passed in Python's requests; so, I skipped them.
Here's the dict that I used to pass the headers:
rh = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'content-length': '564',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.southwest.com',
'referer': 'https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
And here is the form data sent by browser:
fd = {
'toggle_selfltnew': '',
'toggle_AggressiveDrawers': '',
'transitionalAwardSelected': 'false',
'twoWayTrip': 'true',
'originAirport': 'MCI',
# 'originAirport_displayed': 'Kansas City, MO - MCI',
'destinationAirport': 'DAL',
# 'destinationAirport_displayed': 'Dallas (Love Field), TX - DAL',
'airTranRedirect': '',
'returnAirport': 'RoundTrip',
'returnAirport_displayed': '',
'outboundDateString': '02/28/2018',
'outboundTimeOfDay': 'ANYTIME',
'returnDateString': '03/01/2018',
'returnTimeOfDay': 'ANYTIME',
'adultPassengerCount': '1',
'seniorPassengerCount': '0',
'promoCode': '',
'fareType': 'DOLLARS',
'awardCertificateToggleSelected': 'false',
'awardCertificateProductId': ''
}
Note that I commented out two of the items above, but it didn't make any difference. I assumed you'd be having only the location codes and not the full name. If you do have them or if you can extract them from the page, you can send those as well along with other data.
I don't know if it makes any difference, but I used data instead of params:
with requests.Session() as s:
r = s.post(url, headers = rh, data = fd)
soup = BeautifulSoup(r.content, 'lxml')
Finally, here is the result:
>>> soup.find('span', {'class': 'currency_symbol'}).text
'$'

Categories

Resources