python web scraping post form data using requests not working - python

I am trying to post input data into a form using a requests.session and it's returning a 500 status.
I am expecting to see the search results retrieved.
I was able to get around a previous login issue with __RequestVerificationToken and cookies - thanks to the help of Bertrand Martel. The next step in my process is to get the Search page, which I was able to get successfully. Now failing when I try to post data into the date fields on the form, which make up the search criteria. Works when I manually complete the form and press submit. All seems very straightforward to me, but not sure why it won't work. Is it still a cookies issue? Any help would be appreciated.
Here is my code:
import requests
from bs4 import BeautifulSoup
EMAIL = 'myemail#gmail.com'
PASSWORD = 'somepwd'
LOGIN_URL = 'https://www.idocmarket.com/Security/LogOn'
SEARCH_URL = 'https://www.idocmarket.com/RIOCO/Document/Search'
s = requests.Session()
s.get(LOGIN_URL)
result = s.post(LOGIN_URL, data = {
"Login.Username": EMAIL,
"Login.Password": PASSWORD
})
soup = BeautifulSoup(result.text, "html.parser")
# Report successful login
print("Login succeeded: ", result.ok)
print("Status code:", result.status_code)
result = s.get(SEARCH_URL)
auth_token = soup.find("input", {'name': '__RequestVerificationToken'}).get('value')
print('auth token:', auth_token )
print("Get Search succeaeded: ", result.ok)
print("get Search Statusa code:", result.status_code)
result = s.post(SEARCH_URL, data = {
"__RequestVerificationToken": auth_token,
"StartRecordDate": "03/01/2019",
"EndRecordDate": "03/31/2019",
"StartDocNumber": "",
"EndDocNumber": "",
"Book": "",
"Page": "",
"Instrument": "",
"InstrumentGroup": "",
"PartyType": "Either",
"PartyMatchType": "Contains",
"PartyName": "",
"Subdivision": "",
"StartLot": "",
"EndLot": "",
"Block": "",
"Section":"",
"Township": "",
"Range": "",
"Legal": "",
"CountyKey": "RIOCO"
})
print("post Dates succeeded: ", result.ok)
print("post Dates Status code:", result.status_code)
print(result.text)

It seems that this time, the xsrf token is needed in the post along with all the existing parameters. A simple solution is to get all the input value & pass it to the request :
import requests
from bs4 import BeautifulSoup
LOGIN_URL = 'https://www.idocmarket.com/Security/LogOn'
SEARCH_URL = 'https://www.idocmarket.com/RIOCO/Document/Search'
EMAIL = 'myemail#gmail.com'
PASSWORD = 'somepwd'
s = requests.Session()
s.get(LOGIN_URL)
r = s.post(LOGIN_URL, data = {
"Login.Username": EMAIL,
"Login.Password": PASSWORD
})
if (r.status_code == 200):
r = s.get(SEARCH_URL)
soup = BeautifulSoup(r.text, "html.parser")
payload = {}
for input_item in soup.select("input"):
if input_item.has_attr('name'):
payload[input_item["name"]] = input_item["value"]
payload["StartRecordDate"] = '09/01/2019'
payload["EndRecordDate"] = '09/30/2019'
r = s.post(SEARCH_URL, data = payload)
soup = BeautifulSoup(r.text, "html.parser")
print(soup)
else:
print("authentication failure")
Also using comprehension list for the payload you can write :
temp_pl = [
(t['name'], t['value'])
for t in soup.select("input")
if t.has_attr('name')
]
payload = dict(temp_pl)
payload["StartRecordDate"] = '09/01/2019'
payload["EndRecordDate"] = '09/30/2019'

Related

How do I extract all results from a GET request that spans multiple pages?

I have successfully written code that calls an API and then converts the results into a DataFrame.
wax_wallet = "zqsfm.wam"
# Get Assets from AtomicHub API
response1 = requests.get(
"https://wax.api.atomicassets.io/atomicassets/v1/assets?"
f"owner={wax_wallet}"
"&collection_whitelist=nftdraft2121"
"&page=1"
"&limit=1000"
"&order=asc"
"&sort=name")
# Save Response as JSON
json_assets = response1.json()
# Convert JSON to DataFrame
df = pd.json_normalize(json_assets['data'])
This API returns at most 1000 items per page so I need to have it loop through as many pages as needed and ultimately get the results stored into a DataFrame.
I attempted to solve it with the below code, but was unsuccessful.
asset_count = 2500
pages = int(math.ceil(asset_count / 1000))
# Get Assets from AtomicHub API
all_assets = []
for page in range(1, pages):
url = f'https://wax.api.atomicassets.io/atomicassets/v1/assets?owner={wax_wallet}' \
f'&collection_whitelist=nftdraft2121&page={page}&limit=1000&order=asc&sort=name'
response = rq.get(url)
all_assets.append(json.loads(response.text))["response"]
Thanks in advance for any help!
You can turn them into dataframes and then concatenate the individual frames into a final result:
def get_page(page_num):
wax_wallet = "zqsfm.wam"
response = requests.get(
"https://wax.api.atomicassets.io/atomicassets/v1/assets",
params={
"owner": wax_wallet,
"collection_whitelist": "nftdraft2121",
"page": page_num,
"limit": "1000",
"order": "asc",
"sort": "name"
}
)
json_assets = response.json()
return pd.json_normalize(json_assets['data'])
# The number of pages you want
number_of_pages_requested = 10
# Get all pages as dataframes
pages = [get_page(n + 1) for n in range(number_of_pages_requested)]
# Combine pages to single dataframe
df = pd.concat(pages)
Edit: updated using params based on Olvin Roght's comment
Edit 2: fixed indexing error
I think this should help:-
import requests
all_assets = []
URL = 'https://wax.api.atomicassets.io/atomicassets/v1/assets'
params = {
'owner': 'zqsfm.wam',
'collection_whitelist': 'nftdraft2121',
'page': 1,
'order': 'asc',
'sort': 'name',
'limit': 1000
}
with requests.Session() as session:
while True:
print(f"Getting page {params['page']}")
response = session.get(URL, params=params)
response.raise_for_status()
_j = response.json()
data = _j['data']
if len(data) > 0:
all_assets.append(data)
params['page'] += 1
else:
break
print('Done')

How can I extract values from Tableau on this webpage

I am trying to extract the "mobility index" values for each state and county from this webpage:
https://www.cuebiq.com/visitation-insights-mobility-index/
The preferred output would be a panel data of place (state/county) by date for all available places and dates.
There is another thread (How can I scrape tooltips value from a Tableau graph embedded in a webpage) with a similar question. I tried to follow the solution there but it doesn't seem to work for my case.
Thanks a lot in advance.
(A way that I have tried is to download PDF files generated from Tableau, which would contain all counties' value on a specific date. However, I still need to find a way to make request for each date in the data. Anyway, let me know if you have a better idea than this route).
This tableau data url doesn't return any data. In fact, it only render images of the values (canvas probably) and I'm guessing it detects click based on coordinate. Probably, it's made this way to cache the value and render quickly.
But when you click on a state, it actually returns data but it seems it doesn't always returns the result for the state (but works the individual county).
The solution I've found is to use the tooltip to get the data for the state. When you click the state, it generates a request like this :
POST https://public.tableau.com/{path}/{session_id}/commands/tabsrv/render-tooltip-server
with the following form param :
worksheet: US Map - State - CMI
dashboard: CMI
tupleIds: [18]
vizRegionRect: {"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":null}
allowHoverActions: false
allowPromptText: true
allowWork: false
useInlineImages: true
where tupleIds: [18] refers to the index of the state in a list of states in reverse alphabetical order like this :
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
It gives a json with the html of the tooltip which has the CMI and YoY values you want to extract :
{
"vqlCmdResponse": {
"cmdResultList": [{
"commandName": "tabsrv:render-tooltip-server",
"commandReturn": {
"tooltipText": "{\"htmlTooltip\": \"<HTML HERE WITH THE VALUES>\"}]},\"overlayAnchors\":[]}"
}
}]
}
}
The only caveat is that you'll hava to make one request per state :
import requests
from bs4 import BeautifulSoup
import json
import time
data_host = "https://public.tableau.com"
r = requests.get(
f"{data_host}/views/CMI-2_0/CMI",
params= {
":showVizHome":"no",
}
)
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea",{"id": "tsConfigContainer"}).text)
dataUrl = f'{data_host}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data= {
"sheet_id": tableauData["sheetId"],
})
data = []
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
for stateIndex, state in enumerate(stateNames):
time.sleep(0.5) #for throttling
r = requests.post(f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabsrv/render-tooltip-server',
data = {
"worksheet": "US Map - State - CMI",
"dashboard": "CMI",
"tupleIds": f"[{stateIndex+1}]",
"vizRegionRect": json.dumps({"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":None}),
"allowHoverActions": "false",
"allowPromptText": "true",
"allowWork": "false",
"useInlineImages": "true"
})
tooltip = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])["htmlTooltip"]
soup = BeautifulSoup(tooltip, "html.parser")
rows = [
t.find("tr").find_all("td")
for t in soup.find_all("table")
]
entry = { "state": state }
for row in rows:
if (row[0].text == "Mobility Index:"):
entry["CMI"] = "".join([t.text.strip() for t in row[1:]])
if row[0].text == "YoY (%):":
entry["YoY"] = "".join([t.text.strip() for t in row[1:]])
print(entry)
data.append(entry)
print(data)
Try this on repl.it
To get the county information it's the same as this post using the select endpoint which gives you the data with the same format as the post you've linked in your question
The following will extract data for all county and state :
import requests
from bs4 import BeautifulSoup
import json
import time
data_host = "https://public.tableau.com"
worksheet = "US Map - State - CMI"
dashboard = "CMI"
r = requests.get(
f"{data_host}/views/CMI-2_0/CMI",
params= {
":showVizHome":"no",
}
)
soup = BeautifulSoup(r.text, "html.parser")
tableauData = json.loads(soup.find("textarea",{"id": "tsConfigContainer"}).text)
dataUrl = f'{data_host}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
r = requests.post(dataUrl, data= {
"sheet_id": tableauData["sheetId"],
})
data = []
stateNames = ["Wyoming","Wisconsin","West Virginia","Washington","Virginia","Vermont","Utah","Texas","Tennessee","South Dakota","South Carolina","Rhode Island","Pennsylvania","Oregon","Oklahoma","Ohio","North Dakota","North Carolina","New York","New Mexico","New Jersey","New Hampshire","Nevada","Nebraska","Montana","Missouri","Mississippi","Minnesota","Michigan","Massachusetts","Maryland","Maine","Louisiana","Kentucky","Kansas","Iowa","Indiana","Illinois","Idaho","Georgia","Florida","District of Columbia","Delaware","Connecticut","Colorado","California","Arkansas","Arizona","Alabama"]
for stateIndex, state in enumerate(stateNames):
time.sleep(0.5) #for throttling
r = requests.post(f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabsrv/render-tooltip-server',
data = {
"worksheet": worksheet,
"dashboard": dashboard,
"tupleIds": f"[{stateIndex+1}]",
"vizRegionRect": json.dumps({"r":"viz","x":496,"y":148,"w":0,"h":0,"fieldVector":None}),
"allowHoverActions": "false",
"allowPromptText": "true",
"allowWork": "false",
"useInlineImages": "true"
})
tooltip = json.loads(r.json()["vqlCmdResponse"]["cmdResultList"][0]["commandReturn"]["tooltipText"])["htmlTooltip"]
soup = BeautifulSoup(tooltip, "html.parser")
rows = [
t.find("tr").find_all("td")
for t in soup.find_all("table")
]
entry = { "state": state }
for row in rows:
if (row[0].text == "Mobility Index:"):
entry["CMI"] = "".join([t.text.strip() for t in row[1:]])
if row[0].text == "YoY (%):":
entry["YoY"] = "".join([t.text.strip() for t in row[1:]])
r = requests.post(f'{data_host}{tableauData["vizql_root"]}/sessions/{tableauData["sessionid"]}/commands/tabdoc/select',
data = {
"worksheet": worksheet,
"dashboard": dashboard,
"selection": json.dumps({
"objectIds":[stateIndex+1],
"selectionType":"tuples"
}),
"selectOptions": "select-options-simple"
})
entry["county_data"] = r.json()["vqlCmdResponse"]["layoutStatus"]["applicationPresModel"]["dataDictionary"]["dataSegments"]
print(entry)
data.append(entry)
print(data)

Scrape Tables on Multiple Pages with Single URL

I am trying to scrape data from Fangraphs. The tables are split into 21 pages but all of the pages use the same url. I am very new to webscraping (or python in general), but Fangraphs does not have a public API so scraping the page seems to be my only option. I am currently using BeautifulSoup to parse the HTML code and I am able to scrape the initial table, but that only contains the first 30 players, but I want the entire player pool. Two days of web searching and I am stuck. Link and my current code are below. I know they have a link to download the csv file, but that gets tedious through out the season and I would like expedite the data harvesting process. Any direction would be helpful, thank you.
https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=fangraphsdc
import requests
import pandas as pd
url = 'https://www.fangraphs.com/projections.aspx?pos=all&stats=bat&type=fangraphsdc&team=0&lg=all&players=0'
response = requests.get(url, verify=False)
# Use BeautifulSoup to parse the HTML code
soup = BeautifulSoup(response.content, 'html.parser')
# changes stat_table from ResultSet to a Tag
stat_table = stat_table[0]
# Convert html table to list
rows = []
for tr in stat_table.find_all('tr')[1:]:
cells = []
tds = tr.find_all('td')
if len(tds) == 0:
ths = tr.find_all('th')
for th in ths:
cells.append(th.text.strip())
else:
for td in tds:
cells.append(td.text.strip())
rows.append(cells)
# convert table to df
table = pd.DataFrame(rows)
import requests
from bs4 import BeautifulSoup
import pandas as pd
params = {
"pos": "all",
"stats": "bat",
"type": "fangraphsdc"
}
data = {
'RadScriptManager1_TSM': 'ProjectionBoard1$dg1',
"__EVENTTARGET": "ProjectionBoard1$dg1",
'__EVENTARGUMENT': 'FireCommand:ProjectionBoard1$dg1$ctl00;PageSize;1000',
'__VIEWSTATEGENERATOR': 'C239D6F0',
'__SCROLLPOSITIONX': '0',
'__SCROLLPOSITIONY': '1366',
"ProjectionBoard1_tsStats_ClientState": "{\"selectedIndexes\":[\"0\"],\"logEntries\":[],\"scrollState\":{}}",
"ProjectionBoard1_tsPosition_ClientState": "{\"selectedIndexes\":[\"0\"],\"logEntries\":[],\"scrollState\":{}}",
"ProjectionBoard1$rcbTeam": "All+Teams",
"ProjectionBoard1_rcbTeam_ClientState": "",
"ProjectionBoard1$rcbLeague": "All",
"ProjectionBoard1_rcbLeague_ClientState": "",
"ProjectionBoard1_tsProj_ClientState": "{\"selectedIndexes\":[\"5\"],\"logEntries\":[],\"scrollState\":{}}",
"ProjectionBoard1_tsUpdate_ClientState": "{\"selectedIndexes\":[],\"logEntries\":[],\"scrollState\":{}}",
"ProjectionBoard1$dg1$ctl00$ctl02$ctl00$PageSizeComboBox": "30",
"ProjectionBoard1_dg1_ctl00_ctl02_ctl00_PageSizeComboBox_ClientState": "",
"ProjectionBoard1$dg1$ctl00$ctl03$ctl01$PageSizeComboBox": "1000",
"ProjectionBoard1_dg1_ctl00_ctl03_ctl01_PageSizeComboBox_ClientState": "{\"logEntries\":[],\"value\":\"1000\",\"text\":\"1000\",\"enabled\":true,\"checkedIndices\":[],\"checkedItemsTextOverflows\":false}",
"ProjectionBoard1_dg1_ClientState": ""
}
def main(url):
with requests.Session() as req:
r = req.get(url, params=params)
soup = BeautifulSoup(r.content, 'html.parser')
data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
data['__EVENTVALIDATION'] = soup.find(
"input", id="__EVENTVALIDATION").get("value")
r = req.post(url, params=params, data=data)
df = pd.read_html(r.content, attrs={
'id': 'ProjectionBoard1_dg1_ctl00'})[0]
df.drop(df.columns[1], axis=1, inplace=True)
print(df)
df.to_csv("data.csv", index=False)
main("https://www.fangraphs.com/projections.aspx")
Output: view-online

Get data by pages and merge it into one using Python (pagination)

I'm connecting to API which has 500 rows limit per call.
This is my code for a single API call (Works great):
def getdata(data):
auth_token = access_token
hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
urlApi = 'https://..../orders?Offset=0&Limit=499'
datar = requests.get(urlApi, data=data, headers=hed, verify=True)
return datar
Now I want to scale it up so it will get me all the records.
This is what I tried to do:
In order to make sure that I have all the rows, I must iterate until there is no more data:
get 1st page
get 2nd page
merge
get 3rd page
merge
etc...
each page is an API call.
This is what I'm trying to do:
def getData(data):
auth_token = access_token
value_offset = 0
hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
datarALL = None
while True:
urlApi = 'https://..../orders?Offset=' + value_offset + '&Limit=499'
responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
if responsedata.ok:
value_offset = value_offset + 499
#to do: merge the result of the get request
datarALL= datarALL+ responsedata (?)
# to do: check if response is empty then break out.
return datarALL
I couldn't find information about how I merge the results of the API calls nor how do I check if I can break the loop.
Edit:
To clear what I'm after.
I can see the results of the API call using:
logger.debug('response is : {0}'.format(datar.json()))
What I want to be able to do:
logger.debug('response is : {0}'.format(datarALL.json()))
and it will show all results from all calls. This requires generate API calls until there is no more data to get.
This is the return sample of API call:
"offset": 0,
"limit": 0,
"total": 0,
"results": [
{
"field1": 0,
"field2": "string",
"field3": "string",
"field4": "string"
}
]
}
In this case, you are almost correct with the idea.
is_valid = True
while is_valid:
is_valid = False
...
...
responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
if responsedata.status_code == 200: #Use status code to check request status, 200 for successful call
responsedata = responsedata.text
value_offset = value_offset + 499
#to do: merge the result of the get request
jsondata = json.loads(responsedata)
if "results" in jsondata:
if jsondata["results"]:
is_valid = True
if is_valid:
#concat array by + operand
datarALL = datarALL + jsondata["results"]
As I don't know if "results" still exists when the data ran out, so I checked both level.

Python 2.7 BeautifulSoup email scraping stops before end of full database

Hope you are all well! I'm new and using Python 2.7! I'm tring to extract emails from a public available directory website that does not seems to have API: this is the site: http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search
, the code stop gathering email where on the page at the bottom where it says "load more"!
Here is my code:
import requests
import re
from bs4 import BeautifulSoup
file_handler = open('mail.txt','w')
soup = BeautifulSoup(requests.get('http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search').content)
tags = soup('a')
list_new =[]
for tag in tags:
if (re.findall(r'href="mailto:([^"#]+#[^"]+)">\1</a>',('%s'%tag))): list_new = list_new +(re.findall(r'href="mailto:([^"#]+#[^"]+)">\1</a>', ('%s'%tag)))
for x in list_new:
file_handler.write('%s\n'%x)
file_handler.close()
How can i make sure that the code goes till the end of the directory and does not stop where it shows load more?
Thanks.
Warmest regards
You just need to post some data, in particular incrementing group_no to simulate clicking the load more button:
from bs4 import BeautifulSoup
import requests
# you can set whatever here to influence the results
data = {"group_no": "1",
"search": "category",
"segment": "",
"activity": "",
"retail": "",
"category": "",
"Bpark": "",
"alpha": ""}
post = "http://www.tecomdirectory.com/getautocomplete_keyword.php"
with requests.Session() as s:
soup = BeautifulSoup(
s.get("http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search").content,
"html.parser")
print([a["href"] for a in soup.select("a[href^=mailto:]")])
for i in range(1, 5):
data["group_no"] = str(i)
soup = BeautifulSoup(s.post(post, data=data).content, "html.parser")
print([a["href"] for a in soup.select("a[href^=mailto:]")])
To go until the end, you can loop until the post returns no html, that signifies we cannot load any more pages:
def yield_all_mails():
data = {"group_no": "1",
"search": "category",
"segment": "",
"activity": "",
"retail": "",
"category": "",
"Bpark": "",
"alpha": ""}
post = "http://www.tecomdirectory.com/getautocomplete_keyword.php"
start = "http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search"
with requests.Session() as s:
resp = s.get(start)
soup = BeautifulSoup(s.get(start).content, "html.parser")
yield (a["href"] for a in soup.select("a[href^=mailto:]"))
i = 1
while resp.content.strip():
data["group_no"] = str(i)
resp = s.post(post, data=data)
soup = BeautifulSoup(resp.content, "html.parser")
yield (a["href"] for a in soup.select("a[href^=mailto:]"))
i += 1
So if we ran the function like below setting "alpha": "Z" to just iterate over the Z's:
from itertools import chain
for mail in chain.from_iterable(yield_all_mails()):
print(mail)
We would get:
mailto:info#10pearls.com
mailto:fady#24group.ae
mailto:pepe#2heads.tv
mailto:2interact#2interact.us
mailto:gc#worldig.com
mailto:marilyn.pais#3i-infotech.com
mailto:3mgulf#mmm.com
mailto:venkat#4gid.com
mailto:info#4power.biz
mailto:info#4sstudyabroad.com
mailto:fouad#622agency.com
mailto:sahar#7quality.com
mailto:mike.atack#8ack.com
mailto:zyara#emirates.net.ae
mailto:aokasha#zynx.com
Process finished with exit code 0
You should put a sleep in between requests so you don't hammer the server and get yourself blocked.

Categories

Resources