Python (POST) submit a form

I am teaching myself how to submit a form on the web, but somehow the POST is not working.
The URL is https://courselist.wm.edu/courselist/ and the code so far is:
from bs4 import BeautifulSoup
import requests
import urllib
import re

url = 'http://courselist.wm.edu/courselist'

with requests.Session() as session:
    response = session.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    data = {
        'term_code': '201530',
        'term_subj': 'AFST',
        'attr': '0',
        'levl': '0',
        'status': '0'
    }
    r = session.post(url, data=data)
    # response = session.post(url, data=data)
    print(r.content)
    # soup = BeautifulSoup(response.content, 'html.parser')
    # for row in soup.select('table'):
    #     print([td.text for td in row.find_all('td')])

You cannot submit a form with BeautifulSoup; it only parses HTML, it does not act on it. For that you should use Mechanize. An example of how to use it for form submission follows.
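Here is a minimal Mechanize sketch (untested; it assumes the search form is the first form on the page and that the select controls are named term_code and term_subj, as in the code above):

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)    # the site need not serve robots.txt
br.open('https://courselist.wm.edu/courselist/')
br.select_form(nr=0)           # assumption: the search form is form #0
br['term_code'] = ['201530']   # select controls take a list of values
br['term_subj'] = ['AFST']
response = br.submit()         # posts to the form's real action URL
print(response.read())

Mechanize submits to the form's own action attribute, which is the usual reason a hand-built requests.post to the page URL comes back empty.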

Related

Can't Scrape Dynamically Loaded HTML Table in an ASPX Website

I am trying to scrape some data from the Arizona Medical Board. I search for Anesthesiology in the specialty dropdown list and find that the table (with the links to the profiles I want to scrape) is dynamically loaded into the website. I noticed that when hitting the 'specialty search' button, a POST request is made to the server and the HTML table is actually returned from the server. I have tried simulating this POST request to see if I receive the HTML table and can then parse it with bs4. Is this possible, and if so, am I even on the right track?
I have tried to include the form data I found in the network tab of the developer tools, but I am not sure if this is the right data, or if I am forgetting some data here or in the header.
Please let me know if I need to clarify; I understand this may not be worded the best. Thank you!
import requests
# import re
import formdata

session = requests.Session()
url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/public/WebVerificationSearch.aspx?q=azmd&t=20220622123512"
headers = {'User-Agent': 'My-Agent-Placeholder'}
res = session.get(url, headers=headers)
print("Response: {}".format(res))

payload = {
    "__VIEWSTATE": formdata.state,
    "__VIEWSTATEGENERATOR": formdata.generator,
    "__EVENTVALIDATION": formdata.validation,
    "ctl00%24ContentPlaceHolder1%24Name": "rbName1",
    "ctl00%24ContentPlaceHolder1%24txtLastName": '',
    "ctl00%24ContentPlaceHolder1%24txtFirstName": '',
    "ctl00%24ContentPlaceHolder1%24License": "rbLicense1",
    "ctl00%24ContentPlaceHolder1%24txtLicNum": '',
    "ctl00%24ContentPlaceHolder1%24Specialty": "rbSpecialty1",
    "ctl00%24ContentPlaceHolder1%24ddlSpecialty": '12155',
    "ctl00%24ContentPlaceHolder1%24ddlCounty": '15910',
    "ctl00%24ContentPlaceHolder1%24txtCity": '',
    "__EVENTTARGET": "ctl00%24ContentPlaceHolder1%24btnSpecial",
    "__EVENTARGUMENT": ''
}
# params = {"q": "azmd",
#           "t": "20220622123512"}
# url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/Public/Results.aspx"
res = session.post(url, data=payload, headers=headers)
print("Post response: {}".format(res))
print(res.text)
# res = requests.get('https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/Public/Results.aspx', headers=headers)
Try collecting the hidden ASP.NET state fields (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION) from the live page instead of hard-coding them, and use literal $ characters in the field names; requests does the URL-encoding for you:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0"
}

url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/public/WebVerificationSearch.aspx?q=azmd&t=20220622082816"

with requests.Session() as s:
    soup = BeautifulSoup(s.get(url, headers=headers).content, "html.parser")

    # collect every <input> on the page (this picks up the hidden state fields)
    data = {}
    for inp in soup.select("input"):
        data[inp.get("name")] = inp.get("value", "")

    data["ctl00$ContentPlaceHolder1$Name"] = "rbName1"
    data["ctl00$ContentPlaceHolder1$License"] = "rbLicense1"
    data["ctl00$ContentPlaceHolder1$Specialty"] = "rbSpecialty1"
    data["ctl00$ContentPlaceHolder1$ddlSpecialty"] = "12155"
    data["ctl00$ContentPlaceHolder1$ddlCounty"] = "15910"
    data["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$btnSpecial"
    data["__EVENTARGUMENT"] = ""

    soup = BeautifulSoup(
        s.post(url, data=data, headers=headers).content, "html.parser"
    )

    for row in soup.select("tr:has(a)"):
        name = row.select("td")[-1].text
        link = row.a["href"]
        print("{:<35} {}".format(name, link))
Prints:
Abad-Pelsang, Elma A. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1620623&licID=121089&licType=1
Abadi, Bilal Ibrahim https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1755530&licID=525771&licType=1
Abbasian, Mohammad https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1635449&licID=492537&licType=1
Abdel-Al, Naglaa Z. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1637612&licID=175204&licType=1
Abedi, Babak https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1641219&licID=169009&licType=1
Abel, Martin D. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1624271&licID=510929&licType=1
Abenstein, John P. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1622930&licID=502482&licType=1
...and so on.

Python - Scraping WooCommerce does not return the price text

I am working on a price-update control between my company's website and the Tango database (our management/administration system).
Because of that, I have to scrape prices from our website with Python. But
I am having trouble scraping the WooCommerce price text. I tried to scrape it with the requests-html and BeautifulSoup libraries, but both bring (direct from the source) the "bdi" price text as $0.00.
For example: https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg
requests-html script:
from requests_html import HTMLSession
import csv
import time
link = 'https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg'
s = HTMLSession()
r = s.get(link)
#print(r.text)
title = r.html.find('h1', first=True).full_text
price = r.html.find('span.woocommerce-Price-amount.amount bdi')[0].full_text
print(price)
price = r.html.find('span.woocommerce-Price-amount.amount bdi')[1].full_text
print(price)
Result:
$0.00
$0.00
BeautifulSoup script:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg")
soup = BeautifulSoup(page.text, "html.parser")
print(soup)
Result:
<span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>0.00</bdi>
PS: I noticed that when the full website is downloaded in a browser it brings all the data and prices (not $0.00), so I do not know why the libraries are failing.
<div class="woocommerce-variation-price"><span class="price"><span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>325.54</bdi></span> <small class="woocommerce-price-suffix">( IVA incluido )</small></span></div>
Thank you very much!
You could do it with Selenium, but I will show you how to do it with json and bs4: the static HTML only carries a $0.00 placeholder, and the real price is served by WooCommerce's get_variation AJAX endpoint, which we can call directly.
First we need the product ID:
def get_id(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    data_product_id = soup.find('form', class_='variations_form').get('data-product_id')
    return data_product_id
Then, with this ID, we can get the price:
def get_price(product_id, payload):
    url = "https://hierroscasanova.com.ar/?wc-ajax=get_variation"
    payload = f"{payload}&product_id={product_id}"
    headers = {
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    json_data = json.loads(response.text)
    return json_data['display_price']
All that remains is to prepare the parameters for the link, and then we can check:
attribute_pa_medida = '1=3'
attribute_pa_espesor = '2-85'
attribute_pa_unidad = 'kg'
attributes = f'attribute_pa_medida-{attribute_pa_medida}&attribute_pa_espesor={attribute_pa_espesor}&attribute_pa_unidad={attribute_pa_unidad}'
url = f'https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?{attributes}'
print(get_price(get_id(url), attributes))
UPD: the full code:
import requests
import json
from bs4 import BeautifulSoup

def get_id(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    data_product_id = soup.find('form', class_='variations_form').get('data-product_id')
    return data_product_id

def get_price(product_id, payload):
    url = "https://hierroscasanova.com.ar/?wc-ajax=get_variation"
    payload = f"{payload}&product_id={product_id}"
    headers = {
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    json_data = json.loads(response.text)
    return json_data['display_price']

attribute_pa_medida = '1=3'
attribute_pa_espesor = '2-85'
attribute_pa_unidad = 'kg'

attributes = f'attribute_pa_medida-{attribute_pa_medida}&attribute_pa_espesor={attribute_pa_espesor}&attribute_pa_unidad={attribute_pa_unidad}'
url = f'https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?{attributes}'

print(get_price(get_id(url), attributes))
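As for the Selenium route mentioned at the top of this answer, a minimal sketch for comparison (assumptions on my part: a local Chrome/chromedriver install, and that the chosen variation renders its price into the .woocommerce-variation-price element shown in the question):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = ('https://hierroscasanova.com.ar/producto/cano-estructural-redondo/'
       '?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg')

driver = webdriver.Chrome()
try:
    driver.get(url)
    # wait until the page's JavaScript has replaced the $0.00 placeholder
    price = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.woocommerce-variation-price bdi')))
    print(price.text)
finally:
    driver.quit()

The AJAX approach above is much lighter, though; Selenium only pays off when the page logic is too tangled to replay by hand.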

Script fails to generate results

I've written a script in Python to scrape the result populated upon filling in two input boxes, zipcode and distance, with 66109 and 10000. When I try the inputs manually the site does display results, but when I try the same using the script I get nothing. The script throws no error either. What might be the issue here?
Website link
I've tried with:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    session = requests.Session()
    response = session.post(link, data=payload, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

if __name__ == '__main__':
    get_clinics(url)
I'm only after this line, generated when the search is made: Within 10000 miles of 66109 there are 383 clinics.
I changed the URL and the request method to GET, and it worked for me:
def get_clinics(link):
    session = requests.Session()
    response = session.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

url = 'https://www.sart.org/clinic-pages/find-a-clinic?zip=66109&strdistance=10000&SelectedState=Select+State+or+Region'
get_clinics(url)
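As a side note, you can let requests build that query string for you with the params argument instead of hand-encoding it; a small sketch of the same GET request:

import requests
from bs4 import BeautifulSoup

params = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}
response = requests.get('https://www.sart.org/clinic-pages/find-a-clinic',
                        params=params,  # requests URL-encodes these values
                        headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(response.text, "lxml")
print(soup.select_one(".clinics__search-meta").get_text(strip=True))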
Including cookies is the main concern here. If you handle them in the right way, you can get a valid response following the approach you started with. Here is the working code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    with requests.Session() as s:
        res = s.get(link)
        req = s.post(link, data=payload, cookies=res.cookies.get_dict())
        soup = BeautifulSoup(req.text, "lxml")
        item = soup.select_one(".clinics__search-meta").get_text(strip=True)
        print(item)

if __name__ == '__main__':
    get_clinics(url)
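Worth noting: a requests.Session already persists cookies between calls, so the explicit cookies= argument should be redundant. A minimal variant under that assumption (reusing url and payload from above):

def get_clinics(link):
    with requests.Session() as s:
        s.get(link)                       # seeds the session's cookie jar
        req = s.post(link, data=payload)  # the same cookies are re-sent automatically
        soup = BeautifulSoup(req.text, "lxml")
        print(soup.select_one(".clinics__search-meta").get_text(strip=True))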

Trouble fetching results from next pages using post requests

I've written a script in Python to get the tabular data populated upon filling in two input boxes (From and Through) located at the top right corner of a webpage. The dates I filled in to generate the results are 08/28/2017 and 11/25/2018.
When I run my following script, I can get the tabular results from its first page.
However, the data are spread across multiple pages through pagination, while the URL remains unchanged. How can I get the next page content?
URL to the site
This is my attempt:
import requests
from bs4 import BeautifulSoup

url = "https://www.myfloridalicense.com/FLABTBeerPricePosting/"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

try:
    evtrgt = soup.select_one("#__EVENTTARGET").get('value')
except AttributeError:
    evtrgt = ""

viewstate = soup.select_one("#__VIEWSTATE").get('value')
viewgen = soup.select_one("#__VIEWSTATEGENERATOR").get('value')
eventval = soup.select_one("#__EVENTVALIDATION").get('value')

payload = {
    '__EVENTTARGET': evtrgt,
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewgen,
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': eventval,
    'ctl00$MainContent$txtPermitNo': '',
    'ctl00$MainContent$txtPermitName': '',
    'ctl00$MainContent$txtBrandName': '',
    'ctl00$MainContent$txtPeriodBeginDt': '08/28/2017',
    'ctl00$MainContent$txtPeriodEndingDt': '11/25/2018',
    'ctl00$MainContent$btnSearch': 'Search'
}

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    req = s.post(url, data=payload, cookies=res.cookies.get_dict())
    sauce = BeautifulSoup(req.text, "lxml")
    for items in sauce.select("#MainContent_gvBRCSummary tr"):
        data = [item.get_text(strip=True) for item in items.select("th,td")]
        print(data)
Any help solving this issue will be highly appreciated. Once again: the data I wish to grab is the tabular content from the site's subsequent pages, as my script can already parse the data from its first page.
P.S.: A browser simulator is not an option I would like to use.
You need to add a loop over the pages and assign the requested page number to the __EVENTARGUMENT parameter, as follows:
import requests
from bs4 import BeautifulSoup

url = "https://www.myfloridalicense.com/FLABTBeerPricePosting/"
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")

try:
    evtrgt = soup.select_one("#__EVENTTARGET").get('value')
except AttributeError:
    evtrgt = ""

viewstate = soup.select_one("#__VIEWSTATE").get('value')
viewgen = soup.select_one("#__VIEWSTATEGENERATOR").get('value')
eventval = soup.select_one("#__EVENTVALIDATION").get('value')

payload = {
    '__EVENTTARGET': evtrgt,
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewgen,
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': eventval,
    'ctl00$MainContent$txtPermitNo': '',
    'ctl00$MainContent$txtPermitName': '',
    'ctl00$MainContent$txtBrandName': '',
    'ctl00$MainContent$txtPeriodBeginDt': '08/28/2017',
    'ctl00$MainContent$txtPeriodEndingDt': '11/25/2018',
    'ctl00$MainContent$btnSearch': 'Search'
}

with requests.Session() as s:  # one session is enough for all the pages
    s.headers["User-Agent"] = "Mozilla/5.0"
    for page in range(1, 12):
        payload['__EVENTARGUMENT'] = f'Page${page}'
        req = s.post(url, data=payload, cookies=res.cookies.get_dict())
        sauce = BeautifulSoup(req.text, "lxml")
        for items in sauce.select("#MainContent_gvBRCSummary tr"):
            data = [item.get_text(strip=True) for item in items.select("th,td")]
            print(data)
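One hedged caveat: ASP.NET can rotate __VIEWSTATE and __EVENTVALIDATION between postbacks, so if later pages come back empty it may help to re-read the hidden state fields from each response before requesting the next page. A sketch of that variant (it assumes the imports, url, res and payload set up above):

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    for page in range(1, 12):
        payload['__EVENTARGUMENT'] = f'Page${page}'
        req = s.post(url, data=payload, cookies=res.cookies.get_dict())
        sauce = BeautifulSoup(req.text, "lxml")
        for items in sauce.select("#MainContent_gvBRCSummary tr"):
            print([item.get_text(strip=True) for item in items.select("th,td")])
        # carry the freshly issued state fields into the next postback
        for name in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
            field = sauce.select_one('#' + name)
            if field is not None:
                payload[name] = field.get('value')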

Python 301 POST

So basically I'm trying to make a request to this website - https://panel.talonro.com/login/ - which is supposed to respond with a 301 redirect.
I send the data as I should, but in the end there is no Location header in the response and the status code is 200 instead of 301.
I can't figure out what I am doing wrong. Please help.
import requests
from bs4 import BeautifulSoup

def do_request():
    req = requests.get('https://panel.talonro.com/login/').text
    soup = BeautifulSoup(req, 'html.parser')
    csrf = soup.find('input', {'name': 'csrfKey'}).get('value')
    ref = soup.find('input', {'name': 'ref'}).get('value')
    post_data = {
        'auth': 'mylogin',
        'password': 'mypassword',
        'login__standard_submitted': '1',
        'csrfKey': csrf,
        'ref': ref,
        'submit': 'Go'
    }

post = requests.post(url='https://forum.talonro.com/login/', data=post_data, headers={'referer': 'https://panel.talonro.com/login/'})
Right now post_data is local to do_request(), so you cannot access it outside of that function.
Instead, try this, where you return that info and then pass it in:
import requests
from bs4 import BeautifulSoup

def do_request():
    req = requests.get('https://panel.talonro.com/login/').text
    soup = BeautifulSoup(req, 'html.parser')
    csrf = soup.find('input', {'name': 'csrfKey'}).get('value')
    ref = soup.find('input', {'name': 'ref'}).get('value')
    post_data = {
        'auth': 'mylogin',
        'password': 'mypassword',
        'login__standard_submitted': '1',
        'csrfKey': csrf,
        'ref': ref,
        'submit': 'Go'
    }
    return post_data

post = requests.post(url='https://forum.talonro.com/login/', data=do_request(), headers={'referer': 'https://panel.talonro.com/login/'})
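One more hedged suggestion: CSRF tokens are usually tied to the session cookie they were issued with, so fetching the token and posting the login through the same Session may matter here. Also, requests follows redirects by default, so a successful login ends as a final 200; the 301/302 hop is recorded in post.history. A sketch of that variant:

import requests
from bs4 import BeautifulSoup

def get_post_data(session):
    html = session.get('https://panel.talonro.com/login/').text
    soup = BeautifulSoup(html, 'html.parser')
    return {
        'auth': 'mylogin',
        'password': 'mypassword',
        'login__standard_submitted': '1',
        'csrfKey': soup.find('input', {'name': 'csrfKey'}).get('value'),
        'ref': soup.find('input', {'name': 'ref'}).get('value'),
        'submit': 'Go'
    }

with requests.Session() as s:
    post = s.post('https://forum.talonro.com/login/',
                  data=get_post_data(s),  # token fetched with the same cookies
                  headers={'referer': 'https://panel.talonro.com/login/'})
    print(post.status_code, post.history, post.url)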
