I would like your advice on how to optimize my code. More precisely, I want to parse all elements with the class 'value-decrease'.
Code:
import requests
from bs4 import BeautifulSoup
URL = 'https://finance.i.ua/nbu/'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'accept': '*/*'}
def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="data_container")
    currency = []
    for item in items:
        currency.append({
            item.find_all('span', class_='value-decrease').get_text(strip=True)
        })
        print(f"1 usd = {currency} uah")

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
The output should be like this (the numbers are approximate):
1 usd = 27 uah
1 eur = 29 uah
currency is a list that grows with each iteration, so it isn't what you want to print. Also, the class isn't value-decrease; it's value -decrease (note the space). The span you want is also nested inside another span. Here are the fixes:
import requests
from bs4 import BeautifulSoup
URL = 'https://finance.i.ua/nbu/'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'accept': '*/*'}
def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.find_all('tr')[1:]  # find table rows and throw away the header
    for row in rows:
        data = row.find('span', class_='value -decrease')  # is the row a decrease?
        if data:
            currency = row.th.get_text().lower()
            value = data.span.get_text()
            print(f'1 {currency} = {value} uah')

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
Output:
1 usd = 27.2022 uah
1 eur = 29.6341 uah
I realized the currencies that decrease change over time, so I updated my code.
def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    currency = []
    elems = soup.findAll("span", {"class": "value -decrease"})
    countries = [x.parent.previous_sibling.previous_sibling.get_text(strip=True) for x in elems]
    for i in range(len(elems)):
        cur = elems[i].get_text(strip=True).split('.')[0]
        currency.append(cur)
        print(f"1 {countries[i]} = {cur} uah")
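For what it's worth, a slightly more compact sketch of the same idea, pairing each decreasing rate with its row label via zip. It assumes the same table layout as in the answer above, i.e. each 'value -decrease' span sits in a row whose <th> holds the currency code and the rate is in a nested span:
def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    decreases = soup.find_all('span', class_='value -decrease')
    # the currency code lives in the <th> of the same table row as the span
    labels = [span.find_parent('tr').th.get_text(strip=True).lower() for span in decreases]
    for label, span in zip(labels, decreases):
        value = span.span.get_text(strip=True)  # the rate is in a nested <span>
        print(f'1 {label} = {value} uah')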
from bs4 import BeautifulSoup
import requests
searchresults = []
search = 'seo'
url = 'https://www.google.com/search'
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
}
parameters = {'q': search}
content = requests.get(url, headers = headers, params = parameters).text
soup = BeautifulSoup(content, 'html.parser')
search = soup.find(id = 'search')
first_link = search.find('a')
searchresults.append(first_link['href'])
for i, j in enumerate(searchresults):
    print(searchresults[i])
How do I return the whole search result URL list? I would like to later add multiple pages so I can index all the URLs.
If you want to get all the links from the search result, replace your code after search = soup.find(id = 'search'):
a_tags = search.find_all('a', href=True)
searchresults = [i['href'] for i in a_tags]
for i, j in enumerate(searchresults):
    print(j)
Your code currently gives only one link because you are using search.find('a'), which returns the first result, instead of search.find_all('a', href=True), which returns all the a tags that have a link.
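For the multi-page part of the question, one possible approach (a sketch only, since Google may block automated requests or change its markup at any time) is to step through result pages with the start parameter, which normally advances in steps of 10:
from bs4 import BeautifulSoup
import requests

headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
}

def search_links(query, pages=3):
    searchresults = []
    for page in range(pages):
        # start=0, 10, 20, ... selects consecutive result pages
        params = {'q': query, 'start': page * 10}
        content = requests.get('https://www.google.com/search',
                               headers=headers, params=params).text
        soup = BeautifulSoup(content, 'html.parser')
        search = soup.find(id='search')
        if search is None:      # blocked, captcha, or markup changed
            break
        a_tags = search.find_all('a', href=True)
        searchresults.extend(a['href'] for a in a_tags)
    return searchresults

for link in search_links('seo'):
    print(link)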
I have a list of movies that I want to scrap the genres from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup
list=['Se7en','Cinema Paradiso','The Shining','Toy Story 3','Capernaum']
gen2 = {}
for i in list:
    user_query = i + 'movie genre'
    URL = 'https://www.google.co.in/search?q=' + user_query
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    c = soup.find(class_='EDblX DAVP1')
    print(c)
    if c != None:
        genres = c.findAll('a')
        gen2[i] = genres
But it returns an empty dict, so I checked the movies one by one and it worked. For example:
import requests
from bs4 import BeautifulSoup
user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
    h['Se7en'] = genre
So I found out that in the for loop the variable c is returning None. I can't figure out why! It only returns None inside the loop.
Currently, your URLs contain raw, unencoded spaces (for example https://www.google.co.in/search?q=Se7en movie genre), so the returned Google results aren't accurate for all the movies.
You can change it to
for i in list:
    i = "+".join(i.split(" "))
    user_query = i + "+movie+genre"
    URL = 'https://www.google.com/search?q=' + user_query
Also, movies that belong to a single genre, like Cinema Paradiso, are in a div with the class name "Z0LcW".
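Putting both points together, here is a hedged sketch that lets requests do the URL encoding via params instead of concatenating strings, and that falls back to the single-genre div; the class names 'EDblX DAVP1' and 'Z0LcW' are taken from the question and the note above and may change on Google's side:
import requests
from bs4 import BeautifulSoup

movies = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
gen2 = {}

for title in movies:
    # requests URL-encodes the query, so 'Toy Story 3 movie genre' is safe to pass as-is
    params = {'q': f'{title} movie genre'}
    page = requests.get('https://www.google.co.in/search', params=params, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    multi = soup.find(class_='EDblX DAVP1')        # multi-genre list
    if multi is not None:
        gen2[title] = [a.get_text() for a in multi.find_all('a')]
        continue
    single = soup.find('div', class_='Z0LcW')      # single-genre answer box
    if single is not None:
        gen2[title] = [single.get_text()]

print(gen2)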
I am trying to scrape the table from the "https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days" website, but my Python code is not extracting the table.
import requests
from bs4 import BeautifulSoup
url = 'https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
response = requests.get(url, headers=headers)
print(response)
soup = BeautifulSoup(response.text, 'lxml')
print(soup)
data_array = soup.find(id='table-wrap my-3 borderSet maxHeight-900 scrollWrap').get_text().strip().split(":")
type(data_array)
The output is printing the HTML tags instead of the table.
If you want the table, there's a download link available on the page; it provides the data as a CSV file, so you don't need any code. Why don't you just use that?
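If you do go the CSV route, reading the downloaded file back into Python is straightforward with the standard library; the filename below is only a placeholder for whatever the download link actually saves:
import csv

# placeholder filename; use the name of the file the NSE download link gives you
with open('event-calendar.csv', newline='') as f:
    for row in csv.DictReader(f):
        print(row)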
This code will return the whole table as a list; data_table is obtained from a locator found by XPath:
# self.find_element and table_locator come from the surrounding (Selenium-style) class
data_table = self.find_element(table_locator).get_attribute('innerHTML').replace('<th></th>', '')
soup = BeautifulSoup(data_table, 'lxml')
data_rows = soup.find_all('tr')
rows_values_scrape = [[td.getText() for td in data_rows[i].findAll('td')]
                      for i, v in enumerate(data_rows)]
rows_values = [x for x in rows_values_scrape if x]    # keep only rows that have <td> cells
columns_scrape = [[td.getText() for td in data_rows[i].findAll('th')]
                  for i, v in enumerate(data_rows)]
columns = [x for x in columns_scrape if x]            # header rows built from <th> cells
table = []
if columns[1:] != []:
    for i, r in enumerate(columns[1:]):
        table.append([f'column: {columns[0][j]}, row_title: {columns[1:][i][0]}, cell: {rows_values[i][j]}'
                      for j, c in enumerate(columns[0])])
else:
    table = [f'column: {columns[0][j]}, cell: {rows_values[0][j]}'
             for j, c in enumerate(columns[0]) if columns[1:] == []]
return table
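For context, find_element and table_locator above look like they belong to a Selenium page-object class. A minimal, self-contained sketch of the same idea driven directly by Selenium (the XPath //table is a placeholder; the NSE table appears to be rendered by JavaScript, which would explain why plain requests never sees it):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days')

# wait for the table to be rendered by the page's JavaScript (placeholder locator)
table = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.XPATH, '//table')))
inner_html = table.get_attribute('innerHTML').replace('<th></th>', '')
driver.quit()

soup = BeautifulSoup(inner_html, 'lxml')
for tr in soup.find_all('tr'):
    cells = [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
    if cells:
        print(cells)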
I am trying to gather some information about some books available on Amazon and I am running into a weird glitch that I can't understand. At first I thought Amazon was blocking my connection, but then I noticed the request returns "200 OK" and the real HTML content of the corresponding page.
Let's take for example this book: https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110/ref=sr_1_1?crid=2PPCQEJD706VY&dchild=1&keywords=books+bestsellers+2020+paperback&qid=1598132071&sprefix=book%2Caps%2C234&sr=8-1'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
price = {}
if soup.select("#buyBoxInner > ul > li > span > .a-text-strike") != []:
    price["regular_price"] = float(
        soup.select("#buyBoxInner > ul > li > span > .a-text-strike")[0].string[1:].replace(",", "."))
    price["promo_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
else:
    price["regular_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
price["currency"] = soup.select(".offer-price")[0].string[0]
This part works fine and I can get the regular price, a promo price (if one exists), and even the currency. But when I do this:
isbn = soup.select("td.bucket > .content > ul > li")[4].contents[1].string.strip().replace("-", "")
I get "IndexError: list index out of range". But if I debug the code, the content is actually there!
Is this a bug in BeautifulSoup? Is the response too long?
It seems that Amazon returns two versions of the page: one with a <td class="bucket"> and one with several <span> tags instead. This script tries to extract the ISBN from both of them:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
isbn_10 = soup.select_one('span.a-text-bold:contains("ISBN-10"), b:contains("ISBN-10")').find_parent().text
isbn_13 = soup.select_one('span.a-text-bold:contains("ISBN-13"), b:contains("ISBN-13")').find_parent().text
print(isbn_10.split(':')[-1].strip())
print(isbn_13.split(':')[-1].strip())
Prints:
0241985110
978-0241985113
I wish I had an explanation of the problem, but a solution would be to wrap your code in a function, like so:
def scrape():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110/ref=sr_1_1?crid=2PPCQEJD706VY&dchild=1&keywords=books+bestsellers+2020+paperback&qid=1598132071&sprefix=book%2Caps%2C234&sr=8-1'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")
    price = {}
    if soup.select("#buyBoxInner > ul > li > span > .a-text-strike") != []:
        price["regular_price"] = float(
            soup.select("#buyBoxInner > ul > li > span > .a-text-strike")[0].string[1:].replace(",", "."))
        price["promo_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
    else:
        price["regular_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
    price["currency"] = soup.select(".offer-price")[0].string[0]
    # ADD THIS FEATURE TO YOUR CODE
    isbn = soup.select("td.bucket > .content > ul > li")
    if not isbn:
        return scrape()
    isbn = isbn[4].contents[1].string.strip().replace("-", "")
Then if it fails it will just call itself again. You might want to refactor it so it only makes the request once.
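If you would rather avoid unbounded recursion, here is a minimal sketch of the same retry idea written as a loop (the retry count of 5 is an arbitrary choice):
import requests
from bs4 import BeautifulSoup

def fetch_isbn_items(url, headers, retries=5):
    # re-request the page until the td.bucket variant shows up, or give up
    for _ in range(retries):
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, features="lxml")
        items = soup.select("td.bucket > .content > ul > li")
        if items:
            return items
    return []
With that in place, items[4].contents[1] would hold the ISBN line exactly as in the original code, assuming Amazon serves the td.bucket variant at all.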
I'm trying to detect the availability of an item on Amazon. Why doesn't this code work?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json
def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
        return AVAILABILITY

file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)
with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    html = req.get(i)
    doc = SimplifiedDoc(html)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    #soup = BeautifulSoup(html, "lxml")
    soup = BeautifulSoup(response.content, features="lxml")
    title = soup.select("#productTitle")[0].get_text().strip()
    if check(i) == 'In stock.':
        price = soup.select("#priceblock_saleprice")[0].get_text()
    else:
        price = "UNAVAILABLE"
    review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])
    jsonObject = {'title': title, 'price': price, 'review_count': review_count}
    print json.dumps(jsonObject, indent=2)
    print "////////////////////////////////////////////////"
    print "..............................................."

print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
When I execute it, this error appears:
File "scra.py", line 17, in check
doc = html.fromstring(page.content)
AttributeError: 'unicode' object has no attribute 'fromstring'
Please help me. I already tried converting page to pagedata = page.json() but it only made it worse.
Try using this instead of html.fromstring:
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
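For what it's worth, the AttributeError itself most likely comes from the later line html = req.get(i): it rebinds the name html, which was the lxml.html module, to the downloaded page text, so html.fromstring no longer exists by the time check() runs. If you wanted to keep the lxml approach, a minimal sketch is to rename that variable and leave everything else alone:
from lxml import html  # keep this name bound to the lxml module
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc

# ... inside the first loop ...
for i in listoflinks:
    page_html = req.get(i)   # was: html = req.get(i), which shadowed lxml's html
    doc = SimplifiedDoc(page_html)
    amazon_links = doc.getElements('a')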