How to scrape only one price? - python

I'm trying to scrape product prices from a website, and the real price and the monthly payment quota value have exactly the same class, so I can't figure out how to get only the main price.
This is the main price: "879.990"
This is the monthly payment quota: "39.990"
This is the URL: https://listado.mercadolibre.cl/macbook#D[A:macbook]
# THIS GETS ALL THE NAMES AND STORES THEM IN A LIST
prod = soup.find_all('h2', class_='ui-search-item__title shops__item-title')
productos = list()
count = 0
for i in prod:
    if count < 33:
        productos.append(i.text)
    else:
        break
    count += 1
size = len(productos) + 1
#print(size)
#print(productos, len(productos))
print(productos)

# THIS GETS ALL THE PRICES AND STORES THEM IN A LIST
pri = soup.find_all('span', class_="price-tag-fraction")
precios = list()
count = 0
for i in pri:
    if count < 33:
        precios.append(i.text)
    else:
        break
    count += 1
#print(precios)
prices = [item.split(',') for item in precios]

You can filter out the other prices using CSS selectors:
# filsel = 'span.price-tag-fraction:not(span.ui-search-installments span):not(s.price-tag__disabled span)'
emiSp_sel = 'span.ui-search-installments span' # monthly
disab_sel = 's.price-tag__disabled span' # crossed out
filsel = f'span.price-tag-fraction:not({emiSp_sel}):not({disab_sel})'
pri = [p.get_text() for p in soup.select(filsel)]
or using a lambda with find_all:
pri = soup.find_all(
    lambda p: p.name == 'span' and 'price-tag-fraction' in p.get('class', [])
    and p.find_parent('span', {'class': 'ui-search-installments'}) is None
    and p.find_parent('s', {'class': 'price-tag__disabled'}) is None
)
or even by combining a list comprehension with your current method:
pri = [
    p for p in soup.find_all('span', class_="price-tag-fraction")
    if p.find_parent('span', {'class': 'ui-search-installments'}) is None
    and p.find_parent('s', {'class': 'price-tag__disabled'}) is None
]
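Whichever variant you use, you can then pair each product name with its filtered main price. A minimal sketch, assuming soup and filsel are defined as above and that names and prices come back in the same page order:

# Hypothetical pairing; assumes the filtered prices line up one-to-one
# with the product titles on the listing page.
productos = [h.get_text(strip=True)
             for h in soup.find_all('h2', class_='ui-search-item__title shops__item-title')]
precios = [p.get_text(strip=True) for p in soup.select(filsel)]

for nombre, precio in zip(productos, precios):
    print(nombre, '->', precio)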


Scraping the same class with Selenium

I want to scrape the home team and away team from this page: https://www.flashscore.com/match/hY5c1Bhh/#match-summary/match-summary
# Get HomeTeam
_ht = driver.find_element_by_xpath('//*[contains(@class, "home")]')
ht = _ht.find_element_by_xpath('//*[contains(@class, "participantName")]')
_homeName = ht.text

# Get AwayTeam
_at = driver.find_element_by_xpath('//*[contains(@class, "away")]')
at = _at.find_element_by_xpath('//*[contains(@class, "participantName")]')
_awayName = at.text
Output
Longford
Longford
Try to store both of them in a list like this:
from selenium.webdriver.common.by import By

teams = driver.find_elements(By.CSS_SELECTOR, "div[class^='participantName'] a")
print("Home team : ", teams[0].text)
print("Away team : ", teams[1].text)
You are missing the . at the start of the inner XPath when trying to locate an element inside another element.
So your code should be:
# Get HomeTeam
_ht = driver.find_element_by_xpath('//*[contains(@class, "home")]')
ht = _ht.find_element_by_xpath('.//*[contains(@class, "participantName")]')
_homeName = ht.text

# Get AwayTeam
_at = driver.find_element_by_xpath('//*[contains(@class, "away")]')
at = _at.find_element_by_xpath('.//*[contains(@class, "participantName")]')
_awayName = at.text
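Note that the find_element_by_xpath family was removed in Selenium 4. A rough equivalent of the corrected code under the current API (same selectors assumed, untested) would be:

from selenium.webdriver.common.by import By

# Get HomeTeam (the leading '.' keeps the search relative to _ht)
_ht = driver.find_element(By.XPATH, '//*[contains(@class, "home")]')
_homeName = _ht.find_element(By.XPATH, './/*[contains(@class, "participantName")]').text

# Get AwayTeam
_at = driver.find_element(By.XPATH, '//*[contains(@class, "away")]')
_awayName = _at.find_element(By.XPATH, './/*[contains(@class, "participantName")]').text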

How can my program return None for values that are not available, e.g. some movies don't have a Metascore?

The program is supposed to return values for all 50 movies (title, Metascore, genre, gross) and, where a value is not available, append None so that every list ends up with 50 elements; currently some lists only get 43 elements.
import requests
from bs4 import BeautifulSoup

# lists to collect each column
title, metascore, genre, run_time, m_certificate, imdb_rating = [], [], [], [], [], []

url = requests.get('https://www.imdb.com/search/title/?title_type=feature&year=2017-01-01,2017-12-31&start=51&ref_=adv_nxt')
soup = BeautifulSoup(url.text, 'html.parser')
for t, m, g, r, c, i in zip(soup.select('div.lister-list >div.lister-item>div.lister-item-content>h3.lister-item-header>a'),
                            soup.select('div.lister-list >div.lister-item>div.lister-item-content>div.ratings-bar>div.ratings-metascore>span'),
                            soup.select('div.lister-list >div.lister-item>div.lister-item-content>p.text-muted>.genre'),
                            soup.select('div.lister-list >div.lister-item>div.lister-item-content>p.text-muted>.runtime'),
                            soup.select('div.lister-list >div.lister-item>div.lister-item-content>p.text-muted>.certificate'),
                            soup.select('div.lister-list >div.lister-item>div.lister-item-content>div.ratings-bar>div>strong')):
    title.append(t.text)
    metascore.append(m.getText())
    genre.append(g.text.strip())
    run_time.append(r.text)
    m_certificate.append(c.text)
    imdb_rating.append(i.text)
This loop appends None when a value is not present:
votes, gross = [], []
for v in soup.select('div.lister-item-content >p.sort-num_votes-visible'):
    votes.append(v.find('span', attrs={'name': 'nv'}).text)
    vote = v.find_all('span', attrs={'name': 'nv'})
    try:
        gross.append(vote[1].text)
    except IndexError:
        gross.append(None)
Some movies don't have a Metascore, and some don't have a certificate either. You can either use try-except blocks or conditional expressions to get rid of that error. I used the latter in the following example. Give it a shot:
import requests
from bs4 import BeautifulSoup

link = 'https://www.imdb.com/search/title/?title_type=feature&year=2017-01-01,2017-12-31&start=51&ref_=adv_nxt'
res = requests.get(link)
soup = BeautifulSoup(res.text, 'html.parser')
for item in soup.select(".lister-item"):
    name = item.select_one('h3.lister-item-header > a').get_text(strip=True)
    score = item.select_one('span.metascore').get_text(strip=True) if item.select_one('span.metascore') else None
    genre = item.select_one('span.genre').get_text(strip=True) if item.select_one('span.genre') else None
    runtime = item.select_one('span.runtime').get_text(strip=True) if item.select_one('span.runtime') else None
    certificate = item.select_one('span.certificate').get_text(strip=True) if item.select_one('span.certificate') else None
    rating = item.select_one('.rating-star + strong').get_text(strip=True) if item.select_one('.rating-star + strong') else None
    print(name, score, genre, runtime, certificate, rating)
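The repeated "select_one(...) if select_one(...) else None" pattern can be factored into a small helper; a sketch (the helper name is mine, not from the original answer):

def text_or_none(parent, selector):
    """Return the stripped text of the first match, or None if absent."""
    node = parent.select_one(selector)
    return node.get_text(strip=True) if node else None

for item in soup.select(".lister-item"):
    name = text_or_none(item, 'h3.lister-item-header > a')
    score = text_or_none(item, 'span.metascore')
    genre = text_or_none(item, 'span.genre')
    runtime = text_or_none(item, 'span.runtime')
    certificate = text_or_none(item, 'span.certificate')
    rating = text_or_none(item, '.rating-star + strong')
    print(name, score, genre, runtime, certificate, rating)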

Combine two for loops to fill the same dictionary

I am trying to get two different merchants from a list of dictionaries, giving priority to merchants that have prices. If two different merchants with prices cannot be found, merchant 1 or 2 is filled with the remaining data from the list; if the list is still not enough, merchant 1 or 2 should be filled with an empty value.
In other words, the for loop should return two merchants, with priority given to merchants that have prices; if that is not enough to fill merchants 1 and 2, it takes merchants with no prices; finally, if merchant 1 or 2 is still not created, it is filled with an empty value.
Here is the code I have so far. It does the job, but I believe it can be combined in a more Pythonic way.
import csv

dummy_list = []
with open('/home/timmy/testing/example/example/test.csv') as csvFile:
    reader = csv.DictReader(csvFile)
    for row in reader:
        dummy_list.append(row)

item = dict()
index = 1
for merchant in dummy_list:
    if merchant['price']:
        if index == 2:
            if item['merchant_1'] == merchant['name']:
                continue
        item['merchant_%d' % index] = merchant['name']
        item['merchant_%d_price' % index] = merchant['price']
        item['merchant_%d_stock' % index] = merchant['stock']
        item['merchant_%d_link' % index] = merchant['link']
        if index == 3:
            break
        index += 1

for merchant in dummy_list:
    if index == 3:
        break
    if index < 3:
        try:
            if item['merchant_1'] == merchant['name']:
                continue
        except KeyError:
            pass
        item['merchant_%d' % index] = merchant['name']
        item['merchant_%d_price' % index] = merchant['price']
        item['merchant_%d_stock' % index] = merchant['stock']
        item['merchant_%d_link' % index] = merchant['link']
        index += 1

while index < 3:
    item['merchant_%d' % index] = ''
    item['merchant_%d_price' % index] = ''
    item['merchant_%d_stock' % index] = ''
    item['merchant_%d_link' % index] = ''
    index += 1

print(item)
Here are the contents of the CSV file:
price,link,name,stock
,https://www.samsclub.com/sams/donut-shop-100-ct-k-cups/prod19381344.ip,Samsclub,
,https://www.costcobusinessdelivery.com/Green-Mountain-Original-Donut-Shop-Coffee%2C-Medium%2C-Keurig-K-Cup-Pods%2C-100-ct.product.100297848.html,Costcobusinessdelivery,
,https://www.costco.com/The-Original-Donut-Shop%2C-Medium-Roast%2C-K-Cup-Pods%2C-100-count.product.100381350.html,Costco,
57.99,https://www.target.com/p/the-original-donut-shop-regular-medium-roast-coffee-keurig-k-cup-pods-108ct/-/A-13649874,Target,Out of Stock
10.99,https://www.target.com/p/the-original-donut-shop-dark-roast-coffee-keurig-k-cup-pods-18ct/-/A-16185668,Target,In Stock
,https://www.homedepot.com/p/Keurig-Kcup-Pack-The-Original-Donut-Shop-Coffee-108-Count-110030/204077166,Homedepot,Undertermined
As you only want to keep at most 2 merchants, I would process the CSV only once, keeping a list of merchants with prices and a separate list of merchants without prices, and stopping as soon as 2 merchants with prices have been found.
After that loop, I would concatenate those 2 lists plus a list of two empty merchants and take the first 2 elements of the result. That is enough to guarantee your requirement of 2 distinct merchants with priority given to those having prices. Finally, I would use that to fill the item dict.
The code would be:
import csv

with open('/home/timmy/testing/example/example/test.csv') as csvFile:
    reader = csv.DictReader(csvFile)
    names_price = set()
    names_no_price = set()
    merchant_price = []
    merchant_no_price = []
    item = {}
    for merchant in reader:
        if merchant['price']:
            if merchant['name'] not in names_price:
                names_price.add(merchant['name'])
                merchant_price.append(merchant)
                if len(merchant_price) == 2:
                    break
        else:
            if merchant['name'] not in names_no_price:
                names_no_price.add(merchant['name'])
                merchant_no_price.append(merchant)
    void = {k: '' for k in reader.fieldnames}
    merchant_list = (merchant_price + merchant_no_price + [void, void.copy()])[:2]
    for index, merchant in enumerate(merchant_list, 1):
        item['merchant_%d' % index] = merchant['name']
        item['merchant_%d_price' % index] = merchant['price']
        item['merchant_%d_stock' % index] = merchant['stock']
        item['merchant_%d_link' % index] = merchant['link']
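For the sample CSV above, Target is the only distinct merchant with a price (its second row is skipped as a duplicate name), so the second slot comes from the no-price list. Printing item should then give roughly:

{'merchant_1': 'Target',
 'merchant_1_price': '57.99',
 'merchant_1_stock': 'Out of Stock',
 'merchant_1_link': 'https://www.target.com/p/the-original-donut-shop-regular-medium-roast-coffee-keurig-k-cup-pods-108ct/-/A-13649874',
 'merchant_2': 'Samsclub',
 'merchant_2_price': '',
 'merchant_2_stock': '',
 'merchant_2_link': 'https://www.samsclub.com/sams/donut-shop-100-ct-k-cups/prod19381344.ip'}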

How to limit the number of rows that fill a dataframe in a for loop

I have written the following function that scrapes multiple pages from a website. I only want to get the first 20 or so pages. How can I limit the number of rows that I fill in my dataframe?
import re
import sys
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def scrape_page(poi, page_name):
    base_url = "https://www.fake_website.org/"
    report_url = base_url + poi
    page = urlopen(report_url)
    experiences = BeautifulSoup(page, "html.parser")
    empty_list = []
    for link in experiences.findAll('a', attrs={'href': re.compile(page_name + ".shtml$")}):
        url = urljoin(base_url, link.get("href"))
        subpage = urlopen(url)
        expages = BeautifulSoup(subpage, "html.parser")
        for report in expages.findAll('a', attrs={'href': re.compile("^/experiences/exp")}):
            url = urljoin(base_url, report.get("href"))
            reporturlopen = urlopen(url)
            reporturl = BeautifulSoup(reporturlopen, "html.parser")
            book_title = reporturl.findAll("div", attrs={'class': 'title'})
            for i in book_title:
                title = i.get_text()
            book_genre = reporturl.findAll("div", attrs={'class': 'genre'})
            for i in book_genre:
                genre = i.get_text()
            book_author = reporturl.findAll("div", attrs={'class': 'author'})
            for i in book_author:
                author = i.get_text()
                author = re.sub("by", "", author)
            empty_list.append({'title': title, 'genre': genre, 'author': author})
    setattr(sys.modules[__name__], '{}_df'.format(poi + "_" + page_name), empty_list)
You can, for example, add a while loop:
i = 0
while i < 20:
    <insert your code>
    i += 1
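A limit woven into the loop itself reads more naturally, though. A sketch of how the outer loop of scrape_page could be capped, assuming you want at most the first 20 subpage links (islice is just one way to do it):

from itertools import islice

links = experiences.findAll('a', attrs={'href': re.compile(page_name + ".shtml$")})
for link in islice(links, 20):  # process only the first 20 matching pages
    url = urljoin(base_url, link.get("href"))
    # ... the rest of the scraping stays exactly as before ...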

Generate a table of contents from HTML with Python

I'm trying to generate a table of contents from a block of HTML (not a complete file - just content) based on its <h2> and <h3> tags.
My plan so far was to:
1. Extract a list of headers using BeautifulSoup.
2. Use a regex on the content to place anchor links before/inside the header tags (so the user can click on the table of contents) -- there might be a method for replacing inside BeautifulSoup?
3. Output a nested list of links to the headers in a predefined spot.
It sounds easy when I say it like that, but it's proving to be a bit of a pain in the rear.
Is there something out there that does all this for me in one go so I don't waste the next couple of hours reinventing the wheel?
An example:
<p>This is an introduction</p>
<h2>This is a sub-header</h2>
<p>...</p>
<h3>This is a sub-sub-header</h3>
<p>...</p>
<h2>This is a sub-header</h2>
<p>...</p>
Some quickly hacked, ugly piece of code:
soup = BeautifulSoup(html)
toc = []
header_id = 1
current_list = toc
previous_tag = None
for header in soup.findAll(['h2', 'h3']):
    header['id'] = header_id
    if previous_tag == 'h2' and header.name == 'h3':
        current_list = []
    elif previous_tag == 'h3' and header.name == 'h2':
        toc.append(current_list)
        current_list = toc
    current_list.append((header_id, header.string))
    header_id += 1
    previous_tag = header.name
if current_list != toc:
    toc.append(current_list)

def list_to_html(lst):
    result = ["<ul>"]
    for item in lst:
        if isinstance(item, list):
            result.append(list_to_html(item))
        else:
            # item is an (id, title) tuple; link each entry to the id
            # that was set on its header above
            result.append('<li><a href="#%s">%s</a></li>' % item)
    result.append("</ul>")
    return "\n".join(result)

# Table of contents
print(list_to_html(toc))
# Modified HTML
print(soup)
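For the example HTML above, this should print something close to the following (note that the nested <ul> ends up as a sibling of the <li> entries rather than inside one, which browsers tolerate):

<ul>
<li><a href="#1">This is a sub-header</a></li>
<ul>
<li><a href="#2">This is a sub-sub-header</a></li>
</ul>
<li><a href="#3">This is a sub-header</a></li>
</ul>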
Use lxml.html:
- It can deal with invalid HTML just fine.
- It is very fast.
- It allows you to easily create the missing elements and move elements around between the trees.
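This answer gives no code, but a minimal sketch of that approach with lxml.html might look like the following (the anchor naming scheme and flat output are mine, not from the answer):

import lxml.html

content = """
<p>This is an introduction</p>
<h2>This is a sub-header</h2>
<p>...</p>
<h3>This is a sub-sub-header</h3>
<p>...</p>
"""

# Wrap the loose content in a <div> so it parses as one fragment.
root = lxml.html.fragment_fromstring(content, create_parent='div')

toc_entries = []
for i, header in enumerate(root.iter('h2', 'h3'), 1):
    header.set('id', 'toc-%d' % i)  # add the missing anchor target
    toc_entries.append((header.tag, 'toc-%d' % i, header.text_content()))

# Emit a flat, indented list of links; proper nesting is left as an exercise.
for tag, anchor, text in toc_entries:
    indent = '  ' if tag == 'h3' else ''
    print('%s<a href="#%s">%s</a>' % (indent, anchor, text))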
I have come up with an extended version of the solution proposed by Ɓukasz.
def list_to_html(lst):
    result = ["<ul>"]
    for item in lst:
        if isinstance(item, list):
            result.append(list_to_html(item))
        else:
            # item is a (slug, title) tuple; link to the slug anchor
            result.append('<li><a href="#{}">{}</a></li>'.format(item[0], item[1]))
    result.append("</ul>")
    return "\n".join(result)

# slugify() is assumed to come from a library such as python-slugify
soup = BeautifulSoup(article, 'html5lib')
toc = []
h2_prev = 0
h3_prev = 0
h4_prev = 0
h5_prev = 0
for header in soup.findAll(['h2', 'h3', 'h4', 'h5', 'h6']):
    data = [(slugify(header.string), header.string)]
    if header.name == "h2":
        toc.append(data)
        h3_prev = 0
        h4_prev = 0
        h5_prev = 0
        h2_prev = len(toc) - 1
    elif header.name == "h3":
        toc[int(h2_prev)].append(data)
        h3_prev = len(toc[int(h2_prev)]) - 1
    elif header.name == "h4":
        toc[int(h2_prev)][int(h3_prev)].append(data)
        h4_prev = len(toc[int(h2_prev)][int(h3_prev)]) - 1
    elif header.name == "h5":
        toc[int(h2_prev)][int(h3_prev)][int(h4_prev)].append(data)
        h5_prev = len(toc[int(h2_prev)][int(h3_prev)][int(h4_prev)]) - 1
    elif header.name == "h6":
        toc[int(h2_prev)][int(h3_prev)][int(h4_prev)][int(h5_prev)].append(data)
toc_html = list_to_html(toc)
See also: How do I generate a table of contents for HTML text in Python? But I think you are on the right track, and reinventing the wheel will be fun.
