Scrape multiple pages with JSON - Python

I am trying to scrape multiple pages that return JSON, but I get an error.
import requests
import json
import pandas as pd

headers = {
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
    'Connection': 'keep-alive',
    'Origin': 'https://www.nationalhardwareshow.com',
    'Referer': 'https://www.nationalhardwareshow.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'cross-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'accept': 'application/json',
    'content-type': 'application/x-www-form-urlencoded',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
params = {
    'x-algolia-agent': 'Algolia for vanilla JavaScript 3.27.1',
    'x-algolia-application-id': 'XD0U5M6Y4R',
    'x-algolia-api-key': 'd5cd7d4ec26134ff4a34d736a7f9ad47',
}
for i in range(0, 4):
    data = '{"params":"query=&page={i}&facetFilters=&optionalFilters=%5B%5D"}'
    resp = requests.post('https://xd0u5m6y4r-dsn.algolia.net/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query', params=params, headers=headers, data=data).json()
    req_json = resp
    df = pd.DataFrame(req_json['hits'])
    f = pd.DataFrame(df[['name', 'representedBrands', 'description']])
    print(f)
The error:
Traceback (most recent call last):
  File "e:\ScriptScraping\Extract data from json\uk.py", line 31, in <module>
    df = pd.DataFrame(req_json['hits'])
KeyError: 'hits'

Try concatenating the variable i into the data parameter:
import requests
import json
import pandas as pd

headers = {
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
    'Connection': 'keep-alive',
    'Origin': 'https://www.nationalhardwareshow.com',
    'Referer': 'https://www.nationalhardwareshow.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'cross-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'accept': 'application/json',
    'content-type': 'application/x-www-form-urlencoded',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
params = {
    'x-algolia-agent': 'Algolia for vanilla JavaScript 3.27.1',
    'x-algolia-application-id': 'XD0U5M6Y4R',
    'x-algolia-api-key': 'd5cd7d4ec26134ff4a34d736a7f9ad47'
}
lst = []
for i in range(0, 4):
    data = '{"params":"query=&page='+str(i)+'&facetFilters=&optionalFilters=%5B%5D"}'
    resp = requests.post('https://xd0u5m6y4r-dsn.algolia.net/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query', params=params, headers=headers, data=data).json()
    req_json = resp
    df = pd.DataFrame(req_json['hits'])
    f = pd.DataFrame(df[['name', 'representedBrands', 'description']])
    lst.append(f)
    #print(f)
d = pd.concat(lst)
print(d)

It is returning status code 400 because the request is bad: you are sending wrongly formatted data. The {i} in your string is never substituted, since the string is not an f-string. Change:
data = '{"params":"query=&page={i}&facetFilters=&optionalFilters=%5B%5D"}'
to
data = '{"params":"query=&page='+str(i)+'&facetFilters=&optionalFilters=%5B%5D"}'
for it to work. Hope this helps.
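As a variant, an f-string avoids the quote juggling of concatenation, and checking for 'hits' before building the DataFrame makes a bad request easier to diagnose. A minimal sketch, reusing the headers and params dictionaries from the snippet above:
import requests
import pandas as pd

url = ('https://xd0u5m6y4r-dsn.algolia.net/1/indexes/'
       'event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query')
frames = []
for i in range(0, 4):
    # doubled braces produce literal { and }, while {i} is substituted by the f-string
    data = f'{{"params":"query=&page={i}&facetFilters=&optionalFilters=%5B%5D"}}'
    resp = requests.post(url, params=params, headers=headers, data=data).json()
    if 'hits' not in resp:
        # on a bad request Algolia's JSON typically carries a 'message' explaining why
        print('page', i, 'failed:', resp)
        continue
    frames.append(pd.DataFrame(resp['hits'])[['name', 'representedBrands', 'description']])
print(pd.concat(frames, ignore_index=True))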

Related

I have given all the payload data, but it is not working

Can someone please help me with creating an account on proxy.webshare.io/register?
I have tried:
task_id = captcha.create_task(website_url="https://proxy.webshare.io/register/", website_key="6LeHZ6UUAAAAAKat_YS--O2tj_by3gv3r_l03j9d")
print("Wait for respone")
print(task_id)
respone = captcha.join_task_result(task_id).get("gRecaptchaResponse")
print("Recieved key: " + respone)
source = client.get('https://proxy.webshare.io/register').content
soup = BeautifulSoup(source, 'html.parser')
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded',
    'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
val = soup.find("input", {'id': 'a'}).get("value")
print(val)
if 'csrftoken' in client.cookies:
    csrftoken = client.cookies['csrftoken']
else:
    csrftoken = client.cookies['csrf']
print(csrftoken)
mail = em_f_n + "#gmail.com"
passz = pw_f_n
print(mail)
print(passz)
data = {
    "csrfmiddlewaretoken": csrftoken,
    "next": "",
    "a": val,
    "email": mail,
    "password1": passz,
    "g-recaptcha-response": respone
}
r = client.post("https://proxy.webshare.io/register", json=data, headers=header)
print(r.context)
print(r.status_code)
but it just returns a 200 status code (although all of the prints above show data) and the account is never created. Please help.
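One mismatch that stands out: the header declares Content-Type: application/x-www-form-urlencoded, but json=data makes requests send a JSON body instead. A small sketch of the alternative, assuming the registration endpoint expects an ordinary form post (client, data and header are the objects from the snippet above):
# send the dict form-encoded so the body matches the declared Content-Type
r = client.post(
    "https://proxy.webshare.io/register",
    data=data,        # not json=data
    headers=header,
)
print(r.status_code)
print(r.url)          # a successful registration usually redirects away from /register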

Extract href from pages using JSON and scrape multiple pages

import requests

url = "https://baroul-timis.ro/get-av-data?param=toti-avocatii"
payload = {}
headers = {
    'Accept': 'text/html, */*; q=0.01',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': '_csrf-frontend=ccc4c9069d6ad3816ea693a980ecbebda2770e9448ffe9fed17cdf397a5e2851a%3A2%3A%7Bi%3A0%3Bs%3A14%3A%22_csrf-frontend%22%3Bi%3A1%3Bs%3A32%3A%22J3N0AJG6xybnGl91dfrlt-qMOk3hfbQ6%22%3B%7D',
    'Pragma': 'no-cache',
    'Referer': 'https://baroul-timis.ro/tabloul-avocatilor/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}
resp = requests.request("GET", url, headers=headers, data=payload).json()
sample = resp['data']
for test in sample:
    product = test['actions']
    print(product)
This prints HTML like:
<i class="fas fa-address-book" aria-hidden="true"></i>
but I only want the link itself, e.g.:
/tabloul-avocatilor/avocat/av-felicia-petre
I also want to scrape multiple pages, but the URL is the same for every page.
To get all 948 names and links you can use the following example:
import requests
from bs4 import BeautifulSoup

url = "https://baroul-timis.ro/get-av-data?param=toti-avocatii"
data = requests.get(url).json()

for i, d in enumerate(data["data"], 1):
    first_name = d["firstname"]
    last_name = BeautifulSoup(d["lastname"], "html.parser").text
    link = BeautifulSoup(d["actions"], "html.parser").a["href"]
    print(
        "{:<3} {:<30} {:<30} {}".format(
            i, first_name[:29], last_name[:29], link
        )
    )
Prints:
...
943 Adela-Ioana FRUNZĂ /tabloul-avocatilor/avocat/av-adela-frunza
944 Marina GLIGOR-VOLSCHI /tabloul-avocatilor/avocat/av-marina-gligor-volschi
945 Denis-Alexandru TOTH /tabloul-avocatilor/avocat/av-denis-toth
946 Raluca-Roxana ȘURIANU /tabloul-avocatilor/avocat/av-raluca-surianu
947 Alexandra-Bianka CIOBANU /tabloul-avocatilor/avocat/av-alexandra-ciobanu
948 Alexandra-Oana OLARIU /tabloul-avocatilor/avocat/av-alexandra-olariu
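If a table is more convenient than printed rows, the same JSON can be collected into pandas and saved. A small sketch along the lines of the answer above (the output file name is just an example):
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://baroul-timis.ro/get-av-data?param=toti-avocatii"
rows = []
for d in requests.get(url).json()["data"]:
    rows.append({
        "first_name": d["firstname"],
        # lastname and actions contain HTML fragments, so strip the tags / pull the href
        "last_name": BeautifulSoup(d["lastname"], "html.parser").text,
        "link": BeautifulSoup(d["actions"], "html.parser").a["href"],
    })

df = pd.DataFrame(rows)
df.to_csv("avocati.csv", index=False)   # example output file name
print(df.head())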

Python requests: trying to log in to a barchart session

I am trying to log in to barchart and create a session using requests, with no luck; I am not sure what I'm missing. I always get a 500 error.
Code:
import requests

def main():
    site_url = "https://www.barchart.com/login"
    payload = {
        'email': 'user',
        'password': 'pass',
    }
    headers = {
        'authority': 'www.barchart.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'es-ES,es;q=0.9,en;q=0.8,gl;q=0.7',
        'cache-control': 'no-cache',
'cookie': '_gcl_au=1.1.1986418753.1652987508; _fbp=fb.1.1652987508275.1121978397; _ga=GA1.2.581266708.1652987508; __gads=ID=4850c10bd629ae1e-227a8d8bb6d30042:T=1652987552:RT=1652987552:S=ALNI_MYiYqDwr6eWdC-6Q67HlsmfGR9TFQ; _admrla=2.2-8105254c8da36a72-ab5313fc-d7a7-11ec-8803-06c18b5dfeba; pubcv=%7B%7D; tcf2cookie=CPZN1UAPZN1UAAJAEBENCQCsAP_AAEPAACiQIxNd_X__bX9j-_5_bft0eY1P9_r3_-QzjhfNs-8F3L_W_L0Xw2E7NF36pq4KuR4Eu3LBIQNlHMHUTUmwaokVrzHsak2cpyNKJ7LEknMZO2dYGH9Pn9lDuYKY7_5___bx3D-v_t_-39T378Xf3_d5_2_--vCfV599jbn9fV_7_9nP___9v-_8_________wRgAJMNS8gC7MscGTSMIoUQIwrCQqgUAFFAMLRFYAODgp2VgEuoIWACAVIRgRAgxBRgwCAAQSAJCIgJACwQCIAiAQAAgARAIQAETAILACwMAgAFANCxACgAECQgyICI5TAgKkSiglsrEEoK9jTCAOs8AKBRGRUACJJIQSAgJCwcxwBICXiyQNMUL5ACMEKAAAAA.f_gACHgAAAAA; webinar107WebinarClosed=true; _gid=GA1.2.1453697314.1653243141; _hjid=e2d759d4-08f2-4e97-b4bf-15667795cb0e; _hjSessionUser_2563157=eyJpZCI6ImMyZWQyMTQ2LTZmZGItNTViNi1hMzljLTlkODIyOTAyYTlkYSIsImNyZWF0ZWQiOjE2NTMyNDMxNTkwMTAsImV4aXN0aW5nIjp0cnVlfQ==; __aaxsc=2; _hjSession_2563157=eyJpZCI6Ijc2MDE2ZDE3LTRlN2MtNGFiNS05OTgzLTRjNjY5YTg3ODM0YyIsImNyZWF0ZWQiOjE2NTMyNTA4MDE3MzQsImluU2FtcGxlIjpmYWxzZX0=; _hjAbsoluteSessionInProgress=0; _hjIncludedInPageviewSample=1; _hjIncludedInSessionSample=0; market=eyJpdiI6ImdJTllrNEpHMnF6U3B3THRoQ0dZTkE9PSIsInZhbHVlIjoibjkwM3lrYkNORXU0cDNhV25VUHNYUTZ6eFlCOHVQRC9GOEJhM2VJK0RtN21IYjFWQVZMVlRTYXZpZk5idWNLSCIsIm1hYyI6IjE5NmY0MGI3OGNjNjVhZjY5ZWU5N2FkZjY1NWVlYzVjZThmMGM3Mjk0YjljNWEwZjI0YzBjMjQwOThmYTAyN2EifQ%3D%3D; bcFreeUserPageView=0; cto_bundle=JsBghF84Rm9rTThnUWNFdEM0blV1Q1lFUUVha3huMEY1NkpnZFVjblpsNyUyRk8zUFBZNUM2dGp1Q054bkElMkYyR09aaUtRckpUMHViJTJCQjJ2cEg0OGt2c3B6QllxWUczeWRmZEJVUnUlMkZ6MnRrT0xvakxnWXIxeGJtRUdYMXJVVFglMkZ3RWJDSUFEeFFqZDZIN3pSemtZRjZrdndmazNnJTNEJTNE; laravel_token=eyJpdiI6IkNjWW9EUkI1OGdkT1duRVNEMlU1U2c9PSIsInZhbHVlIjoiRXMyRXlsRnpzbFlvdkpRL0RSU0lPeC85Zkx0MGJkdi9mczQ1Nk9WUFlNbGorTlVEUDBGd0VhTysrTHhUWGxRNTVaa3lzMVFOZ0pMRjFIYklFQW9TUlBFT0pZN1BjOUU0TldYVEZjbmZBcFBBWWViRVFHTzFVazFHMHZ0bUlSbEhndzdRNEs0L0xMUjc3cHlKL3FEdGJuTDN5VktaRlVhTTdtYlpLVWM3TDlpWGlBWEtKa3p4Rjh4Ty9zOXVtSGF5djRTVHpPQWZZRFNQQTlpNGNnQURNclpucjlVMG8xaUc0U2NRejdjU25Td0hIb3pLNkxwS1IzcG9KU3p2TUYybmMyajM5cmFsWlhOM0xhQS9tR0xDNktPdHcxK0lKR0JHNE5qUjZPQnlTZUNndkFvQ0l6QjhaVWxlbEJoVWVOKzAiLCJtYWMiOiJhMDc0ZmVmN2I4NGMxNjE2ZWRiM2IwMjY3YmNhNzY3MjZjNjA4ODU5MTQ0MmY2YjMwNWVjZDA2NDIwM2E1MTljIn0%3D; XSRF-TOKEN=eyJpdiI6IlUzSzRkTExjZElxY2FGNGlCVWlNQ3c9PSIsInZhbHVlIjoiM2Y3QllmVGViMEJEOEdjOHNXR1lBdHd0enQxRnp4YlJmVXBiTDhSNjZNYTRYK3lLajVESWg0QTlxcHpLS2pHdDBKYjkrWEV0eHcvQzE0U2J2QnpwR2dQMWVVN1RlNTlhVlJ1M2FlSjhBd2hRd09zVC9YbG8rN1ZVcGQ1OWkwNXMiLCJtYWMiOiI2ZTkzOWMwY2E4OGQyYmU0ZTI0NDc3NjM0NDhmZjAwOGFjYzhiMWQxMGU0ZTdmMDM5YmJmMmUwZmViNDFiODRlIn0%3D; laravel_session=eyJpdiI6Ijk4R1Z1c0U4L0R5cTU0TjBFTWdqY3c9PSIsInZhbHVlIjoiR1lHTVI0c1JPblc2VENtY0thMy9ROXhMYXE2VDZwU3BKNGhpZUUxc2QzOFJySlNhTWVwWnh1RzdTaitDYUdpRXlIckszV0NEL1JCbkpXN3YzamtOWEIvUFJYZGhDMzFVaysrSUJvL3NTQ2NaYndiQjIwbWIxcVZmTGR4Uk5lZVoiLCJtYWMiOiI1YjczYzczZmIyMTQ4NzMxODIzMGIwZjk2MTdkNzU2YjU0N2NjZjkxMDViOGI1YTIzMzA5ZGY1OGY1OWQyYjM0In0%3D; _gat_UA-2009749-51=1; IC_ViewCounter_www.barchart.com=10; aasd=15%7C1653252462025; _awl=2.1653252699.0.5-06f6ddfaf139e746127acfcfca00de3c-6763652d6575726f70652d7765737431-0',
        'origin': 'https://www.barchart.com',
        'pragma': 'no-cache',
        'referer': 'https://www.barchart.com/',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36',
    }
    with requests.Session() as req:
        req.headers.update(headers)
        res = req.post(site_url, data=payload)
        print(res)

if __name__ == '__main__':
    main()
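A 500 on a Laravel site (the laravel_session and XSRF-TOKEN cookies in the request suggest Laravel) is often a missing CSRF token on the POST. Barchart's exact login flow is not confirmed here, but a sketch of the usual pattern is to GET the login page first inside the session so the cookies are set, then echo the XSRF-TOKEN cookie back as a header:
import requests
from urllib.parse import unquote

site_url = "https://www.barchart.com/login"
payload = {'email': 'user', 'password': 'pass'}

with requests.Session() as req:
    req.headers.update({'user-agent': 'Mozilla/5.0'})
    req.get(site_url)                              # sets laravel_session and XSRF-TOKEN cookies
    token = unquote(req.cookies.get('XSRF-TOKEN', ''))
    res = req.post(site_url, data=payload,
                   headers={'x-xsrf-token': token, 'referer': site_url})
    print(res.status_code)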

Getting Response 500 python

I want to access this website https://www.truepeoplesearch.com/ but I am getting a 500 error using requests. I have also tried a proxy API to get a 200 response, but I still get the same result.
Here is my code:
import requests
# headers = {
# 'authority': 'www.truepeoplesearch.com',
# 'method': 'GET',
# 'path': '/?__cf_chl_rt_tk=yRTk.U51_2F3gk2zvxX_GLO5MgGmfwXsQeScGiGloJM-1637358329-0-gaNycGzNCpE',
# 'scheme': 'https',
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept-encoding': 'gzip, deflate, br',
# 'accept-language': 'en-US,en;q=0.9',
# 'cache-control': 'max-age=0',
# 'cookie':' __cf_bm=jDWsHwlBvdk987UFEnrW63LelWWz03HXgZg3jDPaYY0-1637358168-0-AeyRiZHGcxyaD4j/7LGGq1aVmo5sBj/qmFX58OY4gfcUtvfzaG1exKf4HiYNNAlSaqm6LZ3MWB2UBgaOeIugrxA=; aegis_uid=26435771721; aegis_tid=direct; _ga=GA1.2.1056913930.1637358268; _gid=GA1.2.364813628.1637358268; cf_chl_prog=F13; cf_chl_rc_m=7',
# 'referer': 'https://www.truepeoplesearch.com/?__cf_chl_rt_tk=yRTk.U51_2F3gk2zvxX_GLO5MgGmfwXsQeScGiGloJM-1637358329-0-gaNycGzNCpE',
# 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
# 'sec-ch-ua-mobile':' ?0',
# 'sec-ch-ua-platform':' "Windows"',
# 'sec-fetch-dest':' document',
# 'sec-fetch-mode': 'navigate',
# 'sec-fetch-site':' same-origin',
# 'upgrade-insecure-requests': '1',
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
# }
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
payload = {'api_key': 'dd6a50aba760cc67e0276d840a7989cd', 'url': 'https://www.truepeoplesearch.com/'}
r = requests.get('http://api.scraperapi.com', params=payload, headers= headers)
print(r)
I have tried both headers but didn't get a 200 response. Does anybody have any idea what I am doing wrong?
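A first step is to look at what the proxy API actually returns rather than just the response object; the body usually says whether the target site blocked the request. A small sketch reusing the payload from the question (the render flag is a ScraperAPI option for JavaScript-heavy or challenge pages, included here only as an assumption that it applies to this site and plan):
import requests

payload = {
    'api_key': 'dd6a50aba760cc67e0276d840a7989cd',
    'url': 'https://www.truepeoplesearch.com/',
    # 'render': 'true',   # assumption: ask ScraperAPI to render the page, if the plan supports it
}
r = requests.get('http://api.scraperapi.com', params=payload)
print(r.status_code)
print(r.text[:500])       # inspect the body to see why the request failed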

Parsing a table with Pandas

I am trying to parse the table from https://alreits.com/screener
I have tried this:
import requests
import pandas as pd

main_url = 'https://alreits.com/screener'
r = requests.get(main_url)
df_list = pd.read_html(r.text)
df = df_list[0]
print(df)
but pandas can't find the table.
I have also tried using BeautifulSoup4, but it didn't seem to give better results.
This is the selector: #__next > div.MuiContainer-root.MuiContainer-maxWidthLg > div.MuiBox-root.jss9.Card__CardContainer-feksr6-0.fpbzHQ.ScreenerTable__CardContainer-sc-1c5wxgl-0.GRrTj > div > table > tbody
This is the full xPath: /html/body/div/div[2]/div[2]/div/table/tbody
I am trying to get the stock symbol (under Name), sector, score and market cap. The other data would be nice to have but is not necessary.
Thank You!
I found one JSON URL from the dev tools. This is an easy way to extract the table instead of using Selenium. Use a POST request to extract the data.
import requests
import pandas as pd

headers = {
    'authority': 'api.alreits.com:8080',
    'sec-ch-ua': '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'sec-ch-ua-platform': '"Windows"',
    'content-type': 'application/json',
    'accept': '*/*',
    'origin': 'https://alreits.com',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://alreits.com/',
    'accept-language': 'en-US,en;q=0.9',
}
params = (
    ('page', '0'),
    ('size', '500'),
    ('sort', ['marketCap,desc', 'score,desc', 'ffoGrowth,desc']),
)
data = '{"filters":[]}'
response = requests.post('https://api.alreits.com:8080/api/reits/screener', headers=headers, params=params, data=data)
df = pd.DataFrame(response.json())
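Since the question asked specifically for the symbol, sector, score and market cap, you can select just those columns from the DataFrame. The field names below are assumptions based on the sort parameters (marketCap, score); print df.columns first to see the exact names the API uses:
# inspect the available fields, then keep only the ones that exist
print(df.columns.tolist())
wanted = [c for c in ['symbol', 'sector', 'score', 'marketCap'] if c in df.columns]
print(df[wanted])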
The code below will return the data you are looking for.
import requests
import pprint
import json
headers = {
    'content-type': 'application/json',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
r = requests.post(
    'https://api.alreits.com:8080/api/reits/screener?page=0&size=500&sort=marketCap,desc&sort=score,desc&sort=ffoGrowth,desc',
    headers=headers, data=json.dumps({'filters': []}))
if r.status_code == 200:
    pprint.pprint(r.json())
    # Now you have the data - do what you want with it
else:
    print(r.status_code)
