Python - Scraping WooCommerce does not return the price text - python

I am working on a price update control between the website from my work and the Tango database (our management/administration system).
Because of that, I have to scrape prices from our website with Python, but I am having trouble scraping the WooCommerce price text. I tried to scrape with the requests-html and BeautifulSoup libraries, but both return (straight from the page source) the "bdi" price text as $0.00.
For example: https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg
requests_html script:
from requests_html import HTMLSession
import csv
import time
link = 'https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg'
s = HTMLSession()
r = s.get(link)
#print(r.text)
title = r.html.find('h1', first=True).full_text
price = r.html.find('span.woocommerce-Price-amount.amount bdi')[0].full_text
print(price)
price = r.html.find('span.woocommerce-Price-amount.amount bdi')[1].full_text
print(price)
Result:
$0.00
$0.00
BeautifulSoup script:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg")
soup = BeautifulSoup(page.text, "html.parser")
print(soup)
Result:
<span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>0.00</bdi>
PS: I noticed that when the full website is downloaded in a browser it contains all the data and prices (not $0.00), so I do not know why the libraries are failing. This is what the rendered page actually contains:
<div class="woocommerce-variation-price"><span class="price"><span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>325.54</bdi></span> <small class="woocommerce-price-suffix">( IVA incluido )</small></span></div>
Thank you very much!

You can do it with Selenium, but I will show you how to do it with json and bs4 (a minimal Selenium sketch is included at the end of this answer).
First we need the product id:
def get_id(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    data_product_id = soup.find('form', class_='variations_form').get('data-product_id')
    return data_product_id
Then, with this ID, we can get the price:
def get_price(product_id, payload):
    url = "https://hierroscasanova.com.ar/?wc-ajax=get_variation"
    payload = f"{payload}&product_id={product_id}"
    headers = {
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    json_data = json.loads(response.text)
    return json_data['display_price']
All that remains is to prepare the parameters for the link, and then we can check:
attribute_pa_medida = '1=3'
attribute_pa_espesor = '2-85'
attribute_pa_unidad = 'kg'
attributes = f'attribute_pa_medida-{attribute_pa_medida}&attribute_pa_espesor={attribute_pa_espesor}&attribute_pa_unidad={attribute_pa_unidad}'
url = f'https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?{attributes}'
print(get_price(get_id(url), attributes))
UPD: full code:
import requests
import json
from bs4 import BeautifulSoup


def get_id(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    data_product_id = soup.find('form', class_='variations_form').get('data-product_id')
    return data_product_id


def get_price(product_id, payload):
    url = "https://hierroscasanova.com.ar/?wc-ajax=get_variation"
    payload = f"{payload}&product_id={product_id}"
    headers = {
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    json_data = json.loads(response.text)
    return json_data['display_price']


attribute_pa_medida = '1=3'
attribute_pa_espesor = '2-85'
attribute_pa_unidad = 'kg'
attributes = f'attribute_pa_medida-{attribute_pa_medida}&attribute_pa_espesor={attribute_pa_espesor}&attribute_pa_unidad={attribute_pa_unidad}'
url = f'https://hierroscasanova.com.ar/producto/cano-estructural-redondo/?{attributes}'

print(get_price(get_id(url), attributes))
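For completeness, here is a minimal Selenium sketch of the browser-based alternative mentioned at the top of this answer. It assumes Chrome and the CSS classes visible in the variation markup quoted in the question; treat it as a starting point rather than a drop-in solution:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = ('https://hierroscasanova.com.ar/producto/cano-estructural-redondo/'
       '?attribute_pa_medida-1=3&attribute_pa_espesor=2-85&attribute_pa_unidad=kg')

driver = webdriver.Chrome()
try:
    driver.get(url)
    # Wait until WooCommerce has injected the variation price into the DOM
    price_el = WebDriverWait(driver, 15).until(
        EC.visibility_of_element_located(
            (By.CSS_SELECTOR, '.woocommerce-variation-price .woocommerce-Price-amount bdi')
        )
    )
    print(price_el.text)
finally:
    driver.quit()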

Related

Why doesn't the POST request return any data

I want to get my results from my college's website with Python. I wrote this script:
import requests
import time
from bs4 import BeautifulSoup
# Make a request to the website
url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp'
response = requests.get(url)
# Parse the response and create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')
# Find the input field we need to fill with our ID
input_field = soup.find('input', {'name': 'x_st_settingno', 'id': 'x_st_settingno'})
input_field['value'] = 8936 # Fill in our ID
# Find the submit button and click it
submit_button = soup.find('input', {'name': 'Submit', 'id': 'Submit'})
data = {input_field['name']: input_field['value'], submit_button['name']: submit_button['value']}
response2 = requests.post(url, data=data)
# Parse the response and create a BeautifulSoup object
soup2 = BeautifulSoup(response2.text, 'html.parser')
print(soup.find('form:nth-of-type(2) table tbody tr:first-of-type td b font'))
But it always returns None, and I do not know why.
The print(soup.find('form:nth-of-type(2) table tbody tr:first-of-type td b font')) part is just the header of the table that contains the link to my results; if I look for the link itself, it returns None as well.
What am I doing wrong? I am not good at web scraping, I just started learning, and I hope you can help me.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}


def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        # Search parameters for student ID 8936, as the site's own search form submits them
        params = {
            "Submit": "%C8%CD%CB",
            "x_dep": "",
            "x_gro": "",
            "x_sec": "",
            "x_st_name": "",
            "x_st_settingno": "8936",
            "z_dep": "=",
            "z_gro": "=",
            "z_sec": "LIKE",
            "z_st_name": "LIKE",
            "z_st_settingno": "="
        }
        r = req.get(url, params=params)
        soup = BeautifulSoup(r.content, 'lxml')
        # Follow the result link to the page that holds the actual table
        res = urljoin(url, soup.select_one('.ewTableRow span.aspmaker a')['href'])
        r = req.get(res)
        df = pd.read_html(r.content)
        print(df)


main('http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp')
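pd.read_html() returns a list of DataFrames, so if you want to keep the result rather than just print it, you could write the first table to a file. A small sketch ('results.csv' is only an example filename):
# inside main(), after df = pd.read_html(r.content):
df[0].to_csv('results.csv', index=False)  # save the first parsed table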

Can't Scrape Dynamically Loaded HTML Table in an Aspx Website

I am trying to scrape some data from the Arizona Medical Board. I search for Anesthesiology in the specialty dropdown list and find that the table (with the links to the profiles I want to scrape) is dynamically loaded into the page. I notice that when hitting the 'specialty search' button, a POST request is made to the server and the HTML table is returned in the response. I have tried simulating this POST request to see if I receive the HTML table and can then parse it with bs4. Is this possible, and if so, am I even on the right track?
I have tried to include the form data I found in the Network tab of the developer tools, but I am not sure if this is the right data, or whether I am forgetting some data here or in the header.
Please let me know if I need to clarify; I understand this may not be worded the best. Thank you!
import requests
# import re
import formdata

session = requests.Session()

url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/public/WebVerificationSearch.aspx?q=azmd&t=20220622123512"
headers = {'User-Agent': 'My-Agent-Placeholder'}

res = session.get(url, headers=headers)
print("Response: {}".format(res))

payload = {
    "__VIEWSTATE": formdata.state,
    "__VIEWSTATEGENERATOR": formdata.generator,
    "__EVENTVALIDATION": formdata.validation,
    "ctl00%24ContentPlaceHolder1%24Name": 'rbName1',
    "ctl00%24ContentPlaceHolder1%24Name": "rbName1",
    "ctl00%24ContentPlaceHolder1%24txtLastName": '',
    "ctl00%24ContentPlaceHolder1%24txtFirstName": '',
    "ctl00%24ContentPlaceHolder1%24License": "rbLicense1",
    "ctl00%24ContentPlaceHolder1%24txtLicNum": '',
    "ctl00%24ContentPlaceHolder1%24Specialty": "rbSpecialty1",
    "ctl00%24ContentPlaceHolder1%24ddlSpecialty": '12155',
    "ctl00%24ContentPlaceHolder1%24ddlCounty": '15910',
    "ctl00%24ContentPlaceHolder1%24txtCity": '',
    "__EVENTTARGET": "ctl00%24ContentPlaceHolder1%24btnSpecial",
    "__EVENTARGUMENT": ''
}
# params = {"q": "azmd",
#           "t": "20220622123512"}
# url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/Public/Results.aspx"

res = session.post(url, data=payload, headers=headers)
print("Post response: {}".format(res))
print(res.text)
# res = requests.get('https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/Public/Results.aspx', headers=headers)
Try:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0"
}

url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/public/WebVerificationSearch.aspx?q=azmd&t=20220622082816"

with requests.session() as s:
    soup = BeautifulSoup(s.get(url, headers=headers).content, "html.parser")

    # Collect every <input> on the page so the hidden ASP.NET state fields
    # (__VIEWSTATE, __EVENTVALIDATION, ...) are carried over automatically
    data = {}
    for inp in soup.select("input"):
        data[inp.get("name")] = inp.get("value", "")

    # Fill in the same search options the form would submit
    data["ctl00$ContentPlaceHolder1$Name"] = "rbName1"
    data["ctl00$ContentPlaceHolder1$License"] = "rbLicense1"
    data["ctl00$ContentPlaceHolder1$Specialty"] = "rbSpecialty1"
    data["ctl00$ContentPlaceHolder1$ddlSpecialty"] = "12155"
    data["ctl00$ContentPlaceHolder1$ddlCounty"] = "15910"
    data["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$btnSpecial"
    data["__EVENTARGUMENT"] = ""

    soup = BeautifulSoup(
        s.post(url, data=data, headers=headers).content, "html.parser"
    )

    for row in soup.select("tr:has(a)"):
        name = row.select("td")[-1].text
        link = row.a["href"]
        print("{:<35} {}".format(name, link))
Prints:
Abad-Pelsang, Elma A. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1620623&licID=121089&licType=1
Abadi, Bilal Ibrahim https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1755530&licID=525771&licType=1
Abbasian, Mohammad https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1635449&licID=492537&licType=1
Abdel-Al, Naglaa Z. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1637612&licID=175204&licType=1
Abedi, Babak https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1641219&licID=169009&licType=1
Abel, Martin D. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1624271&licID=510929&licType=1
Abenstein, John P. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1622930&licID=502482&licType=1
...and so on.

How to scrape website tables with varying values depending on the selections

I am trying to scrape the table for 2010-2020 from:
https://id.investing.com/commodities/gold-historical-data
The problem is that the URL stays the same between the default date range and the range I choose. So how can I tell Python to scrape the data from 2010 to 2020? Please help me; I am using Python 3.
This is my code:
import requests, bs4

url = 'https://id.investing.com/commodities/gold-historical-data'
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(response.text, 'lxml')
tables = soup.find_all('table')
print(soup)

with open('emasfile.csv', 'w') as csv:
    for row in tables[1].find_all('tr'):
        line = ""
        for td in row.find_all(['td', 'th']):
            line += '"' + td.text + '",'
        csv.write(line + '\n')
This page uses JavaScript with AJAX to fetch the data from
https://id.investing.com/instruments/HistoricalDataAjax
It sends POST requests with extra data, including the start and end dates ("st_date", "end_date").
You could try 01/01/2010 and 12/31/2020 directly, but I used a for loop to get every year separately.
I got all of this information from the 'Network' tab of the DevTools in Chrome/Firefox.
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://id.investing.com/instruments/HistoricalDataAjax'

payload = {
    "curr_id": "8830",
    "smlID": "300004",
    "header": "Data+Historis+Emas+Berjangka",
    "st_date": "01/30/2020",
    "end_date": "12/31/2020",
    "interval_sec": "Daily",
    "sort_col": "date",
    "sort_ord": "DESC",
    "action": "historical_data"
}

headers = {
    #"Referer": "https://id.investing.com/commodities/gold-historical-data",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
    "X-Requested-With": "XMLHttpRequest"
}

fh = open('output.csv', 'w')
csv_writer = csv.writer(fh)

for year in range(2010, 2021):
    print('year:', year)

    payload["st_date"] = f"01/01/{year}"
    payload["end_date"] = f"12/31/{year}"

    r = requests.post(url, data=payload, headers=headers)
    #print(r.text)

    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find('table')

    for row in table.find_all('tr')[1:]:  # [1:] to skip the header row
        row_data = [item.text for item in row.find_all('td')]
        print(row_data)
        csv_writer.writerow(row_data)

fh.close()

How to grab the next page URL from pagination

I am unable to fetch the URL of the next page; it throws a traceback error. Basically, I want to grab "/browse-movies?page=2".
from bs4 import BeautifulSoup
import requests
import re

url = "https://yts.ag/browse-movies?page=1"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('ul', 'tsc_pagination')[0]
for item in items:
    print(item)
You could use range(1, 300) to iterate over all the pages:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "https://yts.ag/browse-movies?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'browse-movie-wrap')
    for item in items:
        for val in item.find_all('div', 'browse-movie-bottom'):
            title = item.find_all('a', 'browse-movie-title')[0].text
            year = item.find_all('div', 'browse-movie-year')[0].text
        for val in item.find_all('a', 'browse-movie-link'):
            try:
                rating = val.find_all('h4')[0].text
                genre = val.find_all('h4')[1].text
            except:
                pass
        print(year, rating, genre, title)
P.S. You might want to add time.sleep(1) to slow down a little, in case they block your IP for scraping their pages too aggressively.
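For example, a sketch of where that delay would go (the loop body is elided; it is the same request-and-parse code as above):
import time

for i in range(1, 300):
    ...  # request and parse the page as above
    time.sleep(1)  # pause for a second between requests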
Edit:
Now, to look for the next page URL, you could use a regular expression:
import re

next_page = soup.find('a', text=re.compile(r'.*Next.*'))
print(next_page['href'])
What it does is look for an a tag whose text matches the regular expression '.*Next.*'.
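Putting those pieces together, here is a hedged sketch of following the "Next" link page by page until it disappears (it assumes the pagination anchor text really contains "Next"; adjust the pattern to the site's actual markup):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

headers = {'User-Agent': 'Mozilla/5.0'}
url = "https://yts.ag/browse-movies?page=1"
while url:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # ... extract title, year, rating and genre here, as in the loop above ...
    next_page = soup.find('a', text=re.compile(r'.*Next.*'))
    url = urljoin(url, next_page['href']) if next_page else None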
urls = ["https://yts.ag/browse-movies?page={}".format(i) for i in range(1, 10)] # make a url list and iterate over it
for url in urls:
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
# your code here
print year, rating, genre, title
Make a URL list and iterate over it. You can change the range.

Python 301 POST

So basically I'm trying to make a request to this website - https://panel.talonro.com/login/ - which is supposed to respond with a 301 redirect.
I send the data as I should, but in the end there is no Location header in the response and the status code is 200 instead of 301.
I can't figure out what I am doing wrong. Please help.
def do_request():
    req = requests.get('https://panel.talonro.com/login/').text
    soup = BeautifulSoup(req, 'html.parser')
    csrf = soup.find('input', {'name': 'csrfKey'}).get('value')
    ref = soup.find('input', {'name': 'ref'}).get('value')
    post_data = {
        'auth': 'mylogin',
        'password': 'mypassword',
        'login__standard_submitted': '1',
        'csrfKey': csrf,
        'ref': ref,
        'submit': 'Go'
    }
    post = requests.post(url='https://forum.talonro.com/login/', data=post_data, headers={'referer': 'https://panel.talonro.com/login/'})
Right now post_data is defined inside do_request(), so you cannot access it outside of that function.
Instead, try this, where you return that info and then pass it in:
import requests
from bs4 import BeautifulSoup


def do_request():
    req = requests.get('https://panel.talonro.com/login/').text
    soup = BeautifulSoup(req, 'html.parser')
    csrf = soup.find('input', {'name': 'csrfKey'}).get('value')
    ref = soup.find('input', {'name': 'ref'}).get('value')
    post_data = {
        'auth': 'mylogin',
        'password': 'mypassword',
        'login__standard_submitted': '1',
        'csrfKey': csrf,
        'ref': ref,
        'submit': 'Go'
    }
    return post_data


post = requests.post(url='https://forum.talonro.com/login/', data=do_request(), headers={'referer': 'https://panel.talonro.com/login/'})
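As a side note on the 301 itself: requests follows redirects by default, so even when the server does answer with a 301/302 you normally only see the final 200 response. A small sketch, if you want to observe the redirect and its Location header instead of following it (same URL and data as above):
resp = requests.post('https://forum.talonro.com/login/',
                     data=do_request(),
                     headers={'referer': 'https://panel.talonro.com/login/'},
                     allow_redirects=False)
print(resp.status_code)              # e.g. 301/302 if the server redirects
print(resp.headers.get('Location'))  # target of the redirect, if any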
