Not able to scrape information from a website using lxml - python

I am trying to scrape user beer reviews from beeradvocate.com to analyze user attitudes towards different types of beer, but I only get results for the first few pages; the rest come back empty.
Situation:
There are 500 different types of beer, and each beer has a different number of ratings and reviews.
The site only shows 1 page of results for guests; to see all the information, you need to log in.
My approach:
Get the beer link and the number of ratings for each beer to define the loop range for that beer.
Log in using a requests session and POST.
import time
import requests
import lxml.html

def review_scrape(beer_link, number_of_ratings):
    reviews = []
    rate = []
    for pages_i in range(0, int(number_of_ratings), 25):  # site shows 25 results/page
        session = requests.session()  # start the session
        payload = {'login': 'suzie102', 'password': ''}
        page1 = session.post("https://www.beeradvocate.com/community/login/login", data=payload)
        url = beer_link+'/?view=beer&sort=&start=%d'%(pages_i)
        page1 = session.get(url)
        time.sleep(3)
        soup1 = lxml.html.fromstring(page1.text)
        rate_i = soup1.xpath('//span[@class="muted"]/text()')[8::3]
        print(url)
        reviews_i = soup1.xpath('//div/text()')
        reviews.append(reviews_i)
        print(len(reviews))
        rate.append(rate_i)
    return rate, reviews
Results: the printed URLs come out with a double slash (//?view=...) and only the first few pages return any reviews; the rest are empty.

There is only one problem that I've seen:
url = beer_link+'/?view=beer&sort=&start=%d'%(pages_i)
The / is redundant; what you need is
url = beer_link+'?view=beer&sort=&start=%d'%(pages_i)
That is why there is //?view in your printed links.
I can also see that there are "next" anchor links leading to the next page. I would recommend a while loop or recursion; a sketch of the while-loop idea follows below.
Other than that, I can't see what is missing from your script. Everything else looks in order, and it should work.
If you could give us more details, we might have more to work with.
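For example, a minimal sketch of that while loop, reusing the requests/lxml stack from the question. The XPath used to locate the "next" anchor and the absolute-URL prefix are assumptions about the page markup and may need adjusting; session is assumed to be an already logged-in requests.Session:
import time
import lxml.html

def review_scrape_all_pages(session, beer_link):
    # Follow "next" links until there are none left (assumed markup).
    reviews, rate = [], []
    url = beer_link + '?view=beer&sort=&start=0'
    while url:
        page = session.get(url)
        doc = lxml.html.fromstring(page.text)
        rate.append(doc.xpath('//span[@class="muted"]/text()')[8::3])
        reviews.append(doc.xpath('//div/text()'))
        # Look for a "next" anchor; stop when the page has none.
        next_href = doc.xpath('//a[contains(text(), "next")]/@href')
        url = 'https://www.beeradvocate.com' + next_href[0] if next_href else None
        time.sleep(3)
    return rate, reviews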

Update: thanks to everyone's comments, I tried using Selenium to scrape instead. It works now:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

def webstite_scrape_p2(beer_link, number_of_ratings):
    driver = webdriver.Chrome('/home/sam/Downloads/chromedriver')
    url = 'https://www.beeradvocate.com/community/login/'
    driver.get(url)
    loginelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@class="xenForm formOverlay"]//dd//input[@name="login"]')))
    loginelement.send_keys('suzie102')
    pwelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class="ctrlUnit"]//dd//ul//li[@id="ctrl_pageLogin_registered_Disabler"]//input[@name="password"]')))
    pwelement.send_keys('')
    page_click = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class="ctrlUnit submitUnit"]//dd//input[@type="submit"]')))
    page_click.click()
    rate = []
    reviews = []
    avg_user = []
    for link, i in zip(beer_link, number_of_ratings):
        for pages_i in tqdm(range(0, int(i), 25)):  # site shows 25 results/page
            new_url = link+'?view=beer&sort=&start=%d'%(pages_i)
            print(new_url)
            driver.get(new_url)
            #print(driver.find_element_by_name("hideRatings").is_selected())
            #check_box = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@style="display:inline;margin:0;padding:0;"]//input[@type="checkbox"]')))
            #check_box.click()
            time.sleep(5)
            driver.get(new_url)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            rate_i = [i.get_text() for i in soup.find_all('span', class_="muted")][8::3]
            rate.append(rate_i)
            reviews_i = [i.get_text() for i in soup.find_all('div')]
            reviews.append(reviews_i)
            avg_i = [i.get_text() for i in soup.find_all('span', class_="BAscore_norm")]
            avg_user.append(avg_i)
    return rate, reviews, avg_user

Related

selenium python: Unable to parse data in <object> tag with #document section

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

url = 'http://www.mtv.de/charts/c6mc86/single-top-100?expanded=true'
chromedriver = Service("/usr/local/bin/chromedriver")
op = webdriver.ChromeOptions()
browser = webdriver.Chrome(service=chromedriver, options=op)
browser.get(url)
timeout = 60
browser.implicitly_wait(20)
browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(5)
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[1]/main/div/section/div/div/div/object')))
    print('========================')
except TimeoutException:
    browser.quit()
items = browser.switch_to.frame(browser.find_element(By.TAG_NAME, 'object'))
print(items)
itembox = items.find_elements(By.CLASS_NAME, 'charts-marslnet')
# print(itembox)
for item in itembox:
    print(item.text)
I have been trying to scrape the song name, author and URL for each song from this website, but I am unable to access the HTML inside the <object> tag under the #document section. I cannot figure out why I can't access it. Any insight into what the issue with my code might be, or what should be done to access this HTML inside the #document section, would be very helpful.
(Screenshot: the HTML inside the <object> tag with the #document section.)
You can grab it from the direct url:
import requests
from bs4 import BeautifulSoup

url = 'https://mtv.marsl.net/demo/showdbcharts.php?c=4'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
acts = soup.find_all('div', {'class':'cmn-act'})
for each in acts:
    title = each.find_next('div', {'class':'cmn-title'}).text.strip()
    artist = each.find_next('div', {'class':'cmn-artist'}).text.strip()
    link = each.find_next('a', href=True)['href']
    print(f'{title}\n{artist}\n{link}\n\n')
Output:
abcdefu
Gayle
https://www.mtv.de/musikvideos/r9d9sl/abcdefu
Wenn ich will
Gzuz & Bonez MC
https://www.mtv.de/musikvideos/7evkst/10von10
10von10
Pajel
https://www.mtv.de/musikvideos/7evkst/10von10
Shivers
Ed Sheeran
https://www.mtv.de/musikvideos/miq9lq/shivers
Heat Waves
Glass Animals
https://www.mtv.de/musikvideos/l9rv5d/heat-waves
...

Selenium: How to scrape/crawl until last page?

So I currently have a function:
from selenium import webdriver
from bs4 import BeautifulSoup

def main(search_term):
    # RUN MAIN PROGRAM ROUTINE
    chromedriver = "chromedriver path"
    driver = webdriver.Chrome(chromedriver)
    records = []
    url = get_url(search_term)
    # SELECT NUMBER OF PAGES TO CRAWL
    for page in range(1, 21):
    #for page in itertools.count():
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        print(page)
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
which scrapes data from pages 1 through 20 of the search results, given a search_term like "electronics", "cosmetics", or "airpod pro case".
However, I realized some searches give me results from page 1 to 3, page 1 to 7, page 1 to 20, and so on, depending on how specific my search_term is.
I was thinking I could keep scraping while the next button is enabled, until my code notices that the next button is disabled, which would mean it has reached the last page of the results.
The XPaths of the enabled and disabled next buttons are:
next_button_enabled = driver.find_element_by_xpath('//li[@class="a-last"]')
next_button_disabled = driver.find_element_by_xpath('//li[@class="a-disabled a-last"]')
but I am not sure how to work this information into what I have written so far.
Since a page URL looks like https://www.amazon.com/s?k=phone&page=2, you can do some basic link hacking. The only thing you need to find out is how many pages there are in total. soup.find('ul', class_="a-pagination").find_all('li') will retrieve the pagination list, and the last page number is the second-to-last item in that list:
from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

url = 'https://www.amazon.com/s?k=phone'  # or https://www.amazon.com/s?k=maison+kitsune+airpod+pro+case
wd = webdriver.Chrome('chromedriver', options=options)
wd.get(url)

soup = BeautifulSoup(wd.page_source, "html.parser")
last_page = int([i.get_text() for i in soup.find('ul', class_="a-pagination").find_all('li')][-2])

for page in range(2, last_page + 1):
    page_url = f'{url}&page={page}'
    # get url with Selenium etc.
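Alternatively, if you would rather keep the disabled-button check from the question, a sketch along these lines should work. It is only a sketch: driver is assumed to be an already-created Selenium driver, url_template the format string returned by get_url, and extract_record the helper from the question:
from bs4 import BeautifulSoup

def scrape_until_last_page(driver, url_template, extract_record):
    # Page through the results until the disabled "next" button appears.
    records = []
    page = 1
    while True:
        driver.get(url_template.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for item in soup.find_all('div', {'data-component-type': 's-search-result'}):
            record = extract_record(item)
            if record:
                records.append(record)
        # find_elements returns an empty list while the disabled button is absent,
        # so a non-empty list means this was the last page.
        if driver.find_elements_by_xpath('//li[@class="a-disabled a-last"]'):
            break
        page += 1
    return records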

Scraping multiple web pages gives the same results as the first page in Python Selenium. What would be the reason?

I am scraping goodreads.com using Selenium and Beautiful Soup. I can get the results for the first page, but when I give the URL for the second page, it loads the first page and gives the first-page results only. I tried different pages and all of them load the first page only. What would be the reason, and how can I overcome this?
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager

# First-page site URL: https://www.goodreads.com/shelf/show/business?page=1
driver = webdriver.Chrome(ChromeDriverManager().install())

# Reading the second page
driver.get("https://www.goodreads.com/shelf/show/non-fiction?page=2")
time.sleep(3)

desc, title, author, rating = [], [], [], []  # result lists used below

summaryItems = driver.find_elements_by_xpath("//a[contains(@class, 'bookTitle')]")
job_links = [summaryItem.get_attribute("href") for summaryItem in summaryItems]

for job_link in job_links:
    driver.get(job_link)
    # Closing the pop-up window
    try:
        close = driver.find_element_by_class_name('gr-iconButton')  # single element so .click() works
        close.click()
    except:
        close = "None"
    try:
        # Taking book description
        more = driver.find_element_by_css_selector("#description > a:nth-child(3)").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        #for item in soup.findAll("span", id=re.compile("^freeText"))[:2]:
        #    print(item.text)
        sections = soup.findAll("span", id=re.compile("^freeText"))[:2]
        print("message ")
        i = 0
        for item in soup.findAll("span", id=re.compile("^freeText"))[:2]:
            i = i + 1
            if i == 2:
                desc.append(item.text)
    except:
        more = "None"
    try:  # Taking book title
        # time.sleep(2)
        job_title = driver.find_element_by_xpath("//h1[@class='gr-h1 gr-h1--serif']").text
        #job_title = driver.find_element_by_id('bookTitle').find_element_by_class_name('gr-h1 gr-h1--serif').text
        title.append(job_title)
        #print(title)
    except:
        job_title = "None"
    # Taking Author name
    try:
        # time.sleep(2)
        authors = driver.find_element_by_xpath("//a[@class='authorName']").text
        author.append(authors)
        #print(author)
    except:
        authors = "None"
    # Taking Ratings
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    rate = soup.find("span", itemprop="ratingValue").text.strip()
    rates = rate.replace('\n', '')
    rating.append(rates)

driver.close()
Output:
I am able to scrape book title, author name, book description, and rating for the first page only.
You should log in first to scrape data on the other pages.
Try adding the following code to your script:
driver = webdriver.Chrome(ChromeDriverManager().install())

# Add the code below right after webdriver.Chrome()
driver.get("https://www.goodreads.com/user/sign_in")
time.sleep(5)
driver.find_element_by_css_selector("#user_email").send_keys("your email")
driver.find_element_by_css_selector("#user_password").send_keys("your password")
driver.find_element_by_xpath("//input[@type='submit' and @value='Sign in']").click()

Python scraper advice

I have been working on a scraper for a little while now, and have come very close to getting it to run as intended. My code is as follows:
import urllib.request
from bs4 import BeautifulSoup

# Crawls main site to get a list of city URLs
def getCityLinks():
    city_sauce = urllib.request.urlopen('https://www.prodigy-living.co.uk/')  # Enter url here
    city_soup = BeautifulSoup(city_sauce, 'html.parser')
    the_city_links = []
    for city in city_soup.findAll('div', class_="city-location-menu"):
        for a in city.findAll('a', href=True, text=True):
            the_city_links.append('https://www.prodigy-living.co.uk/' + a['href'])
    return the_city_links

# Crawls each of the city web pages to get a list of unit URLs
def getUnitLinks():
    getCityLinks()
    for the_city_links in getCityLinks():
        unit_sauce = urllib.request.urlopen(the_city_links)
        unit_soup = BeautifulSoup(unit_sauce, 'html.parser')
        for unit_href in unit_soup.findAll('a', class_="btn white-green icon-right-open-big", href=True):
            yield('the_url' + unit_href['href'])

the_unit_links = []
for link in getUnitLinks():
    the_unit_links.append(link)

# Soups returns all of the html for the items in the_unit_links
def soups():
    for the_links in the_unit_links:
        try:
            sauce = urllib.request.urlopen(the_links)
            for things in sauce:
                soup_maker = BeautifulSoup(things, 'html.parser')
                yield(soup_maker)
        except:
            print('Invalid url')

# Below scrapes property name, room type and room price
def getPropNames(soup):
    try:
        for propName in soup.findAll('div', class_="property-cta"):
            for h1 in propName.findAll('h1'):
                print(h1.text)
    except:
        print('Name not found')

def getPrice(soup):
    try:
        for price in soup.findAll('p', class_="room-price"):
            print(price.text)
    except:
        print('Price not found')

def getRoom(soup):
    try:
        for theRoom in soup.findAll('div', class_="featured-item-inner"):
            for h5 in theRoom.findAll('h5'):
                print(h5.text)
    except:
        print('Room not found')

for soup in soups():
    getPropNames(soup)
    getPrice(soup)
    getRoom(soup)
When I run this, it returns all the prices for all the URLs picked up. However, it does not return the names or the rooms, and I am not really sure why. I would really appreciate any pointers on this, or ways to improve my code - I have been learning Python for a few months now!
I think that the links you are scraping will, in the end, redirect you to another website, in which case your scraping functions will not be useful.
For instance, the link for a room in Birmingham redirects you to another website.
Also, be careful with your usage of the find and find_all methods in BS. The first returns only one tag (as when you want a single property name), while find_all() returns a list, allowing you to get, for instance, multiple room prices and types.
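For illustration, a minimal self-contained snippet (the HTML here is made up) showing the difference:
from bs4 import BeautifulSoup

html = '<div><p class="room-price">£100 pw</p><p class="room-price">£120 pw</p></div>'
soup = BeautifulSoup(html, "html.parser")

print(soup.find("p", class_="room-price").text)                   # first match only
print([p.text for p in soup.find_all("p", class_="room-price")])  # every match, as a list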
Anyway, I have simplified your code a bit, and this is how I came across your issue. Maybe you can take some inspiration from it:
import requests
from bs4 import BeautifulSoup

main_url = "https://www.prodigy-living.co.uk/"

# Getting individual cities url
re = requests.get(main_url)
soup = BeautifulSoup(re.text, "html.parser")
city_tags = soup.find("div", class_="footer-city-nav")  # Bottom page not loaded dynamically
cities_links = [main_url + tag["href"] for tag in city_tags.find_all("a")]  # Links to cities

# Getting the individual links to the apts
indiv_apts = []
for link in cities_links[0:4]:
    print("At link: ", link)
    re = requests.get(link)
    soup = BeautifulSoup(re.text, "html.parser")
    links_tags = soup.find_all("a", class_="btn white-green icon-right-open-big")
    for url in links_tags:
        indiv_apts.append(main_url + url.get("href"))

# Now defining your functions
def GetName(tag):
    print(tag.find("h1").get_text())

def GetType_Price(tags_list):
    for tag in tags_list:
        print(tag.find("h5").get_text())
        print(tag.find("p", class_="room-price").get_text())

# Now scraping each of the apts - name, price, room.
for link in indiv_apts[0:2]:
    print("At link: ", link)
    re = requests.get(link)
    soup = BeautifulSoup(re.text, "html.parser")
    property_tag = soup.find("div", class_="property-cta")
    rooms_tags = soup.find_all("div", class_="featured-item")
    GetName(property_tag)
    GetType_Price(rooms_tags)
You will see that right at the second element of the list, you will get an AttributeError, as you are no longer on your website's pages. Indeed:
>>> print(indiv_apts[1])
https://www.prodigy-living.co.uk/http://www.iqstudentaccommodation.com/student-accommodation/birmingham/penworks-house?utm_source=prodigylivingwebsite&utm_campaign=birminghampagepenworksbutton&utm_medium=referral # You will not scrape the expected link right at the beginning
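One way to avoid that crash is to skip hrefs that are already absolute URLs (i.e. external redirects) before building the list. A minimal sketch, assuming the same button class as above:
import requests
from bs4 import BeautifulSoup

main_url = "https://www.prodigy-living.co.uk/"

def internal_apartment_links(city_url):
    # Return only the apartment links that stay on prodigy-living.co.uk.
    soup = BeautifulSoup(requests.get(city_url).text, "html.parser")
    links = []
    for tag in soup.find_all("a", class_="btn white-green icon-right-open-big"):
        href = tag.get("href", "")
        if href.startswith("http"):  # absolute URL -> external redirect, skip it
            continue
        links.append(main_url + href)
    return links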
Next time, come with a precise problem to solve, or otherwise take a look at the Code Review section.
On find and find_all: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#calling-a-tag-is-like-calling-find-all
Finally, I think it also answers your question here: https://stackoverflow.com/questions/42506033/urllib-error-urlerror-urlopen-error-errno-11001-getaddrinfo-failed
Cheers :)

Scraping data from href

I was trying to get the postcodes for DFS stores. For that, I tried getting the href for each shop and then clicking on it; the next page has the shop location, from which I can get the postal code. But I am not able to get things working. Where am I going wrong?
I tried getting the upper-level element td.searchResults first, and then for each of them I try to click on the href with title DFS and, after clicking, get the postalCode. Eventually I will iterate over all three pages.
If there is a better way to do it, let me know.
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
driver.get('http://www.localstore.co.uk/stores/75061/dfs/')
html = driver.page_source
soup = BeautifulSoup(html)
listings = soup.select('td.searchResults')
for l in listings:
    while True:
        driver.find_element_by_css_selector("a[title*='DFS']").click()
        shops = {}
        #info = soup.find('span', itemprop='postalCode').contents
        html = driver.page_source
        soup = BeautifulSoup(html)
        info = soup.find(itemprop="postalCode").get_text()
        shops.append(info)
Update:
driver = webdriver.Firefox()
driver.get('http://www.localstore.co.uk/stores/75061/dfs/')
html = driver.page_source
soup = BeautifulSoup(html)
listings = soup.select('td.searchResults')
for l in listings:
    driver.find_element_by_css_selector("a[title*='DFS']").click()
    shops = []
    html = driver.page_source
    soup = BeautifulSoup(html)
    info = soup.find_all('span', attrs={"itemprop": "postalCode"})
    for m in info:
        if m:
            m_text = m.get_text()
            shops.append(m_text)
print(shops)
So after playing with this for a little while, I don't think the best way to do this is with Selenium. It would require using driver.back(), waiting for elements to re-appear, and a whole mess of other stuff. I was able to get what you want using just requests, re and bs4. re is included in the Python standard library, and if you haven't installed requests, you can do it with pip as follows: pip install requests
from bs4 import BeautifulSoup
import re
import requests

base_url = 'http://www.localstore.co.uk'
url = 'http://www.localstore.co.uk/stores/75061/dfs/'
res = requests.get(url)
soup = BeautifulSoup(res.text)
shops = []
links = soup.find_all('a', href=re.compile(r'.*/store/.*'))
for l in links:
    full_link = base_url + l['href']
    town = l['title'].split(',')[1].strip()
    res = requests.get(full_link)
    soup = BeautifulSoup(res.text)
    info = soup.find('span', attrs={"itemprop": "postalCode"})
    postalcode = info.text
    shops.append(dict(town_name=town, postal_code=postalcode))
print(shops)
Your code has some problems. You are using an infinite loop without a breaking condition. Also, shops = {} is a dict, but you are using the append method on it.
Instead of using Selenium you can use python-requests or urllib2.
But in your code you can do something like this:
driver = webdriver.Firefox()
driver.get('http://www.localstore.co.uk/stores/75061/dfs/')
html = driver.page_source
soup = BeautifulSoup(html)
listings = soup.select('td.searchResults')
for l in listings:
    driver.find_element_by_css_selector("a[title*='DFS']").click()
    shops = []
    html = driver.page_source
    soup = BeautifulSoup(html)
    info = soup.find('span', attrs={"itemprop": "postalCode"})
    if info:
        info_text = info.get_text()
        shops.append(info_text)
print(shops)
In BeautifulSoup you can find a tag by its attribute like this:
soup.find('span', attrs={"itemprop": "postalCode"})
Also, if it doesn't find anything, it will return None, and calling .get_text() on None will raise an AttributeError. So check first before applying .get_text().
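A minimal, self-contained example of that check (the HTML and postcode here are made up):
from bs4 import BeautifulSoup

html = '<div><span itemprop="postalCode">B5 4ST</span></div>'  # stand-in for driver.page_source
soup = BeautifulSoup(html, 'html.parser')

shops = []
info = soup.find('span', attrs={"itemprop": "postalCode"})
if info is not None:  # guard against pages where the tag is missing
    shops.append(info.get_text())
print(shops)  # ['B5 4ST']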
