Hi, I'm new to Python and crawling. After researching and going through Stack Overflow, I came up with Python + Selenium: open a Webdriver, load the URL, get the page source, and turn it into the data I need. However, I know there's a better approach (for example, scraping without Selenium, not having to parse the page source, posting data to the ASP endpoint, etc.), and I hope I can get some help here for educational purposes.
Here's what I'd like to achieve.
Start:
http://www.asos.com/Women/New-In-Clothing/Cat/pgecategory.aspx?cid=2623
Obtain: product title, price, img, and its link
Next: go to the next page if there is one; if not, output the results
Before you go into my code, here is some background information. Asos uses pagination, so this is about scraping across multiple pages. I also tried it without Selenium by posting to http://www.asos.com/services/srvWebCategory.asmx/GetWebCategories
with this data:
{'cid':'2623', 'strQuery':"", 'strValues':'undefined', 'currentPage':'0',
'pageSize':'204','pageSort':'-1','countryId':'10085','maxResultCount':''}
but I get no response back.
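For reference, this is roughly how that POST could be made with requests; the content-type header and the JSON body format are guesses on my part, so they may well be what's wrong:
import requests

# hypothetical sketch: POST the payload above as JSON to the endpoint from the question;
# the headers and response handling are assumptions and may need adjusting
url = "http://www.asos.com/services/srvWebCategory.asmx/GetWebCategories"
payload = {'cid': '2623', 'strQuery': "", 'strValues': 'undefined', 'currentPage': '0',
           'pageSize': '204', 'pageSort': '-1', 'countryId': '10085', 'maxResultCount': ''}
headers = {'Content-Type': 'application/json; charset=UTF-8'}

r = requests.post(url, json=payload, headers=headers, timeout=30)
print(r.status_code)
print(r.text[:500])  # inspect whatever the service returns, if anything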
I know my approach is not good; I'd much appreciate any help/recommendations/approaches/ideas! Thanks!
import scrapy
import time
import logging
from random import randint
from selenium import webdriver
from asos.items import ASOSItem
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class ASOSSpider(scrapy.Spider):
    name = "asos"
    allowed_domains = ["asos.com"]
    start_urls = [
        "http://www.asos.com/Women/New-In-Clothing/Cat/pgecategory.aspx?cid=2623#/parentID=-1&pge=0&pgeSize=204&sort="
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        view_204 = self.driver.find_element_by_xpath("//div[@class='product-count-bottom']/a[@class='view-max-paged']")
        view_204.click()  # click and show 204 pictures
        time.sleep(5)  # wait till the 204 images are loaded; I've also tried the explicit wait below, but I got timed out
        # element = WebDriverWait(self.driver, 8).until(EC.presence_of_element_located((By.XPATH, "category-controls bottom")))
        logging.debug("wait time has been reached! go CRAWL!")
        next = self.driver.find_element_by_xpath("//li[@class='page-skip']/a")
        pageSource = Selector(text=self.driver.page_source)  # load the page source instead; can't seem to crawl the page by just passing the regular request
        for sel in pageSource.xpath("//ul[@id='items']/li"):
            item = ASOSItem()
            item["product_title"] = sel.xpath("a[@class='desc']/text()").extract()
            item["product_link"] = sel.xpath("a[@class='desc']/@href").extract()
            item["product_price"] = sel.xpath("div/span[@class='price']/text()").extract()
            item["product_img"] = sel.xpath("div/a[@class='productImageLink']/img/@src").extract()
            yield item
        next.click()
        self.driver.close()
Related
I need to scrape the titles of all blog post articles behind a Load More button, over the range I set (for i in range(1,3):).
At present I'm only able to capture the titles on the first page, even though I'm able to navigate to the next page using Selenium.
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # Comment to view browser actions

# Get website url
urls = "https://jooble.org/blog/"
r = requests.get(urls)

driver = webdriver.Chrome(executable_path=r"C:\webdrivers\chromedriver.exe", options=options)
driver.get(urls)

productlist = []

for i in range(1, 3):

    # Get Page Information
    soup = BeautifulSoup(r.content, features='lxml')
    items = soup.find_all('div', class_='post')
    print(f'LOOP: start [{len(items)}]')

    for single_item in items:
        title = single_item.find('div', class_='front__news-title').text.strip()
        print('Title:', title)

        product = {
            'Title': title,
        }

        productlist.append(product)
        print()

    time.sleep(5)
    WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, "//button[normalize-space()='Show more']"))).send_keys(Keys.ENTER)

driver.close()

# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
You do not need the Selenium overhead in this case, because you can use requests directly to get the question-specific data via the API.
Check the network tab in your browser's devtools when you click the button, and you will get the URL that is requested to load more content. Iterate over it and set the parameter value &page={i}.
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup

data = []

for i in range(1, 3):
    url = f'https://jooble.org/blog/wp-admin/admin-ajax.php?id=&post_id=0&slug=home&canonical_url=https%3A%2F%2Fjooble.org%2Fblog%2F&posts_per_page=6&page={i}&offset=20&post_type=post&repeater=default&seo_start_page=1&preloaded=false&preloaded_amount=0&lang=en&order=DESC&orderby=date&action=alm_get_posts&query_type=standard'
    r = requests.get(url)

    if r.status_code != 200:
        print(f'Error occurred: {r.status_code} on url: {url}')
    else:
        soup = BeautifulSoup(r.json()['html'], 'html.parser')

        for e in soup.select('.type-post'):
            data.append({
                'title': e.select_one('.front__news-title').get_text(strip=True),
                'description': e.select_one('.front__news-description').get_text(strip=True),
                'url': e.a.get('href')
            })

pd.DataFrame(data)
Output
  | title | description | url
0 | How To Become A Copywriter | If you have a flair for writing, you might consider leveraging your talents to earn some dough by working as a copywriter. The primary aim of a copywriter is to… | https://jooble.org/blog/how-to-become-a-copywriter/
1 | How to Find a Job in 48 Hours | A job search might sound scary for many people. However, it doesn't have to be challenging, long, or overwhelming. With Jooble, it is possible to find the best employment opportunities… | https://jooble.org/blog/how-to-find-a-job-in-48-hours/
2 | 17 Popular Jobs That Involve Working With Animals | If you are interested in caring for or helping animals, you can build a successful career in this field. The main thing is to find the right way. Working with… | https://jooble.org/blog/17-popular-jobs-that-involve-working-with-animals/
3 | How to Identify Phishing and Email Scam | What Phishing and Email Scam Are Cybercrime is prospering, and more and more internet users are afflicted daily. The best example of an online scam is the phishing approach -… | https://jooble.org/blog/how-to-identify-phishing-and-email-scam/
4 | What To Do After Getting Fired | For many people, thoughts of getting fired tend to be spine-chilling. No wonder, since it means your everyday life gets upside down in minutes. Who would like to go through… | https://jooble.org/blog/what-to-do-after-getting-fired/
5 | A mobile application for a job search in 69 countries has appeared | Jooble, a job search site in 69 countries, has launched the Jooble Job Search mobile app for iOS and Android. It will help the searcher view vacancies more conveniently and… | https://jooble.org/blog/a-mobile-application-for-a-job-search-in-69-countries-has-appeared/
...
Experts here, I am looking for your help, if you don't mind.
Recently, I have been working on a web crawler using Scrapy and Selenium in Python, and I'm stuck.
I just want to ask whether it is possible to still get empty results even when you've used the statement
WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, xxxxx)))
to get those elements. Also, it doesn't even take 100 seconds before the results come back empty. Why?
And by the way, it is a random thing, meaning this phenomenon happens anywhere, anytime.
Does getting empty results have something to do with my network connection?
Could you help me, or give me some opinions or suggestions about the question above?
Thanks a lot!
-----------------------supplementary notes-----------------------
Thanks for the heads up.
In summary, I used Scrapy and Selenium to crawl a review site and write the username, posting time, comment content, etc. to an .xlsx file via pipeline.py. I wanted it to be as fast as possible while still gathering complete information.
On a page with many comments, reviews whose text is too long get collapsed, which means that almost 20 comments per page have their own expand button.
Therefore, I need to use Selenium to click the expand button and then use the driver to fetch the complete comment. Common sense dictates that it takes a bit of time to load after the expand button is clicked, and I believe the time it takes depends on the speed of the network, so using WebDriverWait seems to be a wise choice here. In my experience, the default parameters timeout=10 and poll_frequency=0.5 seem to be too slow and error-prone, so I considered using timeout=100 and poll_frequency=0.1 instead.
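In code, the tuned wait I am describing is simply this (the XPath is the expand button from my full code below):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 100 s, polling every 0.1 s, for all expand buttons to be present
wait = WebDriverWait(driver, timeout=100, poll_frequency=0.1)
unfolds = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='unfold']")))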
However, the problem is that every time I run the project via the command scrapy crawl spider, there are always several comment crawls that come back empty, and each time the location of the empty results is different. I've thought about using time.sleep() to force a stop, but that would take a lot of time if every page did that, even though it is certainly a more reliable way to get complete information. Also, it looks inelegant and a little clumsy in my opinion.
Have I expressed my question clearly?
-------------------------------add something--------------------------------
The exact meaning of "I got empty results somewhere" is shown in the picture below.
---------------------------add my code--------------------------2022/5/18
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://movie.douban.com/subject/5045678/reviews?start=0')

users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a[class=name]')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span[class=main-meta]')]
full_content, words = [], []

unfolds = WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='unfold']")))

# Here's how I think about and design my loop body.
# I click the expand button, then grab the text, then fold it back up, then move on to the next one.
for i in range(len(unfolds)):
    unfolds[i].click()
    time.sleep(1)
    # After the javascript runs, the `div[@class='review-content clearfix']` appears,
    # and some of the full review content will be put in a `<p></p>` tag
    find_full_content_p = WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='review-content clearfix']/p")))
    full_content_p = [j.text for j in find_full_content_p]
    # and some of it will just be put in `div[@class='review-content clearfix']` itself.
    find_full_content_div = WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='review-content clearfix']")))
    full_content_div = [j.text for j in find_full_content_div]
    # and I merge the two lists
    full_content_p.extend(full_content_div)
    full_content.append("".join(full_content_p))
    words.append(len("".join(full_content_p)))
    time.sleep(1)
    # then fold it back up
    WebDriverWait(driver, 100, 0.1).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='fold']"))).click()

driver.close()
pd.DataFrame({"users": users, "dates": dates, "full_content": full_content, "words": words})
AND, this is the code of an expert I genuinely respect named sound wave. (It is slightly modified; the core code has not been changed.)
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome()
driver.get('https://movie.douban.com/subject/5045678/reviews?start=0')

users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a[class=name]')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span[class=main-meta]')]
reviews, words = [], []

for review in driver.find_elements(By.CSS_SELECTOR, 'div.review-short'):
    show_more = review.find_elements(By.CSS_SELECTOR, 'a.unfold')
    if show_more:
        # scroll to the show more button, needed to avoid ElementClickInterceptedException
        driver.execute_script('arguments[0].scrollIntoView({block: "center"});', show_more[0])
        show_more[0].click()
        review = review.find_element(By.XPATH, 'following-sibling::div')
        while review.get_attribute('class') == 'hidden':
            time.sleep(0.2)
        review = review.find_element(By.CSS_SELECTOR, 'div.review-content')
    reviews.append(review.text)
    words.append(len(review.text))
    print('done', len(reviews), end='\r')

pd.DataFrame({"users": users, "dates": dates, "reviews": reviews, "words": words})
NEW
Added code for the site douban. To export the scraped data to a csv, see the pandas code in the OLD section below.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service('...'))
driver.get('https://movie.douban.com/subject/5045678/reviews?start=0')

users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a[class=name]')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span[class=main-meta]')]
reviews = []

for review in driver.find_elements(By.CSS_SELECTOR, 'div.review-short'):
    show_more = review.find_elements(By.CSS_SELECTOR, 'a.unfold')
    if show_more:
        # scroll to the show more button, needed to avoid ElementClickInterceptedException
        driver.execute_script('arguments[0].scrollIntoView({block: "center"});', show_more[0])
        show_more[0].click()
        review = review.find_element(By.XPATH, 'following-sibling::div')
        while review.get_attribute('class') == 'hidden':
            time.sleep(0.2)
        review = review.find_element(By.CSS_SELECTOR, 'div.review-content')
    reviews.append(review.text)
    print('done', len(reviews), end='\r')
OLD
For the website you mentioned (imdb.com), in order to scrape the hidden content there is no need to click the show more button, because the text is already loaded in the HTML code; it is simply not shown on the site. So you can scrape all the comments in a single pass. The code below stores users, dates and reviews in separate lists, and finally saves the data to a .csv file.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service(chromedriver_path))
driver.get('https://www.imdb.com/title/tt1683526/reviews')
# sets a maximum waiting time for .find_element() and similar commands
driver.implicitly_wait(10)
reviews = [el.get_attribute('innerText') for el in driver.find_elements(By.CSS_SELECTOR, 'div.text')]
users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span.display-name-link')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span.review-date')]
# store data in a csv file
import pandas as pd
df = pd.DataFrame(list(zip(users,dates,reviews)), columns=['user','date','review'])
df.to_csv(r'C:\Users\your_name\Desktop\data.csv', index=False)
To print a single review you can do something like this
i = 0
print(f'User: {users[i]}\nDate: {dates[i]}\n{reviews[i]}')
the output (truncated) is
User: dschmeding
Date: 26 February 2012
Wow! I was not expecting this movie to be this engaging. Its one of those films...
I'm having issues scraping data from a website. The issue might be with Visual Studio Code; I am using the "Code Runner" extension. This is my first time using Beautiful Soup and Selenium, so the issue might also be with my code. I started last Friday and, after some difficulty, came up with a solution on Saturday. My code is:
import requests
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

parcelID = 1014100000  # this is a random parcelID I grabbed from the site
url = 'https://www.manateepao.com/parcel/?parid={}'.format(parcelID)

driver = webdriver.Chrome()
driver.get(url)

html = driver.execute_script("return document.documentElement.outerHTML")

# was getting an encoding error with print(html); replaced the character that was giving me trouble
newHTML = html.replace(u"\u2715", "*")

soupFilter = SoupStrainer('div', {'id': 'ownerContentScollContainer'})
soup = BeautifulSoup(newHTML, 'html.parser', parse_only=soupFilter)

webparcelID = soup.find_all('b')
lColumn = soup.find_all('div', {'class': 'col-sm-2 m-0 p-0 text-sm-right'})
rColumn = soup.find_all('div', {'class': 'col-sm m-0 p-0 ml-2'})

parcel_Dict = {}
for i in range(len(lColumn)):
    parcel_Dict[i] = {lColumn[i].string: rColumn[i].string}

# This is to test if I got any results or not
print(parcel_Dict)

driver.close()
driver.quit()
What I am hoping to find each time I scrape a page is:
The Parcel ID. This is in its own bold (b) tag.
The Ownership and Mailing Address. The Ownership should always be at parcel_Dict[1] and the mailing address should always be at parcel_Dict[3].
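For illustration only, the kind of structure I'm expecting looks roughly like this (the labels and values here are made up, not real data from the site):
# hypothetical illustration -- labels and values are invented, not scraped
parcel_Dict = {
    1: {'Ownership': 'SOME OWNER NAME'},
    3: {'Mailing Address': '123 EXAMPLE ST'},
    # ...plus whatever other label/value pairs the page has at the remaining indices
}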
I run the code and sometimes I get a result, and other times I get an empty dictionary.
Thank you for any help you can provide.
I solved my own issue by adding the following lines of code
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='ownerContentScollContainer']")))
I waited until the ownerContentScrollContainer was fully loaded before proceeding to execute the rest of the code.
This post and this post helped me figure out where I might be going wrong. I used this tutorial to figure out how to use the appropriate Xpath.
I am trying to extract data from a webpage (https://clinicaltrials.gov). I have built a scraper using Selenium and lxml and it is working fine. I need to hit the next page button once the first page's scraping is done; after going to the next page I need to take that page's URL using driver.current_url and start the scraping again.
The problem is that only the search results table changes while the URL stays the same, so whenever the driver takes the current URL (driver.current_url), the first page's results come back again and again.
Edited:
Here is the code:
import re
import time
import urllib.parse
import lxml.html
import pandas as pd
import requests
import urllib3
from lxml import etree
from lxml import html
from pandas import ExcelFile
from pandas import ExcelWriter
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
siteurl = 'https://clinicaltrials.gov/'
driver = webdriver.Chrome()
driver.get(siteurl)
WebDriverWait(driver, 5)
driver.maximize_window()
def advancesearch():
    driver.find_element_by_link_text('Advanced Search').click()
    driver.find_element_by_id('StartDateStart').send_keys('01/01/2016')
    driver.find_element_by_id('StartDateEnd').send_keys('12/30/2020')
    webdriver.ActionChains(driver).send_keys(Keys.ENTER).perform()
    time.sleep(3)

driver.find_element_by_xpath("//input[contains(@id, 'home-search-condition-query')]").send_keys('medicine')  # Give keyword here
advancesearch()
#driver.find_element_by_xpath("//div[contains(@class, 'dataTables_length')]//label//select//option[4]").click()
#time.sleep(8)

def nextbutton():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    driver.find_element_by_xpath("//a[contains(@class, 'paginate_button next')]").click()

def extractor():
    cur_url = driver.current_url
    read_url = requests.get(cur_url)
    souptree = html.fromstring(read_url.content)
    tburl = souptree.xpath("//table[contains(@id, 'theDataTable')]//tbody//tr//td[4]//a//@href")
    for tbu in tburl:
        allurl = []
        allurl.append(urllib.parse.urljoin(siteurl, tbu))
        for tb in allurl:
            get_url = requests.get(tb)
            get_soup = html.fromstring(get_url.content)
            pattern = re.compile(r"^\s+|\s*,\s*|\s+$")
            name = get_soup.xpath('//td[@headers="contactName"]//text()')
            phone = get_soup.xpath('//td[@headers="contactPhone"]//text()')
            mail = get_soup.xpath('//td[@headers="contactEmail"]//a//text()')
            artitle = get_soup.xpath('//td[@headers="contactEmail"]//a//@href')
            artit = ([x for x in pattern.split(str(artitle)) if x][-1])
            title = artit[:-2]
            for (names, phones, mails) in zip(name, phone, mail):
                fullname = names[9:]
                print(fullname, phones, mails, title, sep='\t')

while True:
    extractor()
    nextbutton()
You don't need to get the URL if the page has already changed.
You could start re-iterating once the page has reloaded after you click next. You can make the driver wait until an element is present (explicit wait) or just wait (implicit wait).
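For example, something along these lines; the id here is taken from the table in your code and may need adjusting:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# explicit wait: block until the results table is present again after clicking next
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "theDataTable"))
)

# or an implicit wait, set once, which applies to every find_element call
driver.implicitly_wait(10)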
There are a number of changes I would probably make (for example, use shorter, less fragile css selectors and bs4; a bs4 sketch is at the end of this answer), but the two that stand out:
1) You already have the data you need so there is no requirement for a new url. Simply use the current page_source of driver.
So the top of the extractor function becomes:
def extractor():
    souptree = html.fromstring(driver.page_source)
    tburl = souptree.xpath("//table[contains(@id, 'theDataTable')]//tbody//tr//td[4]//a//@href")
    #rest of code
2) To reduce iterations I would set the results count to 100 at the start:
def advancesearch():
    driver.find_element_by_link_text('Advanced Search').click()
    driver.find_element_by_id('StartDateStart').send_keys('01/01/2016')
    driver.find_element_by_id('StartDateEnd').send_keys('12/30/2020')
    webdriver.ActionChains(driver).send_keys(Keys.ENTER).perform()
    time.sleep(3)
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#theDataTable_length [value='100']"))).click()  # change to 100 results so less looping
then add the additional import:
from selenium.webdriver.common.by import By
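And, since I mentioned bs4 and shorter css selectors above, a rough sketch of what item 1) could look like with BeautifulSoup instead of lxml; the css id selector for the results table and the link column are assumptions based on your xpath and may need adjusting:
from bs4 import BeautifulSoup
import urllib.parse

def extractor():
    # parse the already rendered page source instead of re-requesting the url
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # assumed selector: links in the 4th column of the results table
    links = [a.get('href') for a in soup.select('#theDataTable tbody tr td:nth-of-type(4) a')]
    allurl = [urllib.parse.urljoin(siteurl, link) for link in links if link]
    #rest of code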
I need to continuously get the data via the next button <1 2 3 ... 5>, but there's no href link provided in the source, and there's also an ellipsis. Any idea please? Here's my code:
def start_requests(self):
    urls = (
        (self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
    )
    for cb, url in urls:
        yield scrapy.Request(url, callback=cb)

def parse_2(self, response):
    for product_item_forever in response.css('div.pi_container'):
        forever_item = {
            'forever-title': product_item_forever.css('p.p_name::text').extract_first(),
            'forever-regular-price': product_item_forever.css('span.p_old_price::text').extract_first(),
            'forever-sale-price': product_item_forever.css('span.p_sale.t_pink::text').extract_first(),
            'forever-photo-url': product_item_forever.css('img::attr(data-original)').extract_first(),
            'forever-description-url': product_item_forever.css('a.item_slider.product_link::attr(href)').extract_first(),
        }
        yield forever_item
Please help me, thank you.
It seems this pagination uses an additional request to an API.
So, there are two ways:
Use Splash/Selenium to render the pages, following the pattern shown by QHarr;
Make the same calls to the API. Check the developer tools and you will find the POST request https://www.forever21.com/us/shop/Catalog/GetProducts with all the proper params (they are too long, so I will not post the full list here).
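Roughly, the second option would look like this; the payload is left as a placeholder because the parameters are too long to list, and the headers and response format are assumptions to verify against the actual request in devtools:
import requests

url = 'https://www.forever21.com/us/shop/Catalog/GetProducts'
payload = {
    # copy the full parameter set from the POST request shown in devtools;
    # it is too long to reproduce here
}
headers = {'Content-Type': 'application/json'}  # assumption: check the actual request headers

r = requests.post(url, json=payload, headers=headers)
data = r.json()  # assumption: the endpoint returns JSON containing the product list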
The URL changes, so you can specify the page number and results per page in the URL, e.g.
https://www.forever21.com/uk/shop/catalog/category/f21/sale/#pageno=2&pageSize=120&filter=price:0,250
As mentioned by @vezunchik and OP feedback, this approach requires selenium/splash to allow js to run on the page. If you were going down that route you could just click the next button (.p_next) until you reach the end page, as it is easy to grab the last page number (.dot + .pageno) from the document (a click-through sketch is at the end of this answer).
I appreciate you are trying with scrapy.
Demo of the idea with selenium, in case it helps.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url_loop = 'https://www.forever21.com/uk/shop/catalog/category/f21/sale/#pageno={}&pageSize=120&filter=price:0,250'
url = 'https://www.forever21.com/uk/shop/catalog/category/f21/sale'

d = webdriver.Chrome()
d.get(url)
d.find_element_by_css_selector('[onclick="fnAcceptCookieUse()"]').click()  # get rid of cookies
items = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#products .p_item")))
d.find_element_by_css_selector('.selectedpagesize').click()
d.find_elements_by_css_selector('.pagesize')[-1].click()  # set page result count to 120
last_page = int(d.find_element_by_css_selector('.dot + .pageno').text)  # get last page

if last_page > 1:
    for page in range(2, last_page + 1):
        url = url_loop.format(page)
        d.get(url)
        try:
            d.find_element_by_css_selector('[type=reset]').click()  # reject offer
        except:
            pass
        # do something with page
        break  # delete later
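And a rough sketch of the click-through variant mentioned above, in case the url building is not wanted. It re-uses d, WebDriverWait, EC and By from the demo, and the selectors are the ones already shown; treat it as a sketch rather than tested code:
# alternative sketch: click the next button until the last page number is reached
last_page = int(d.find_element_by_css_selector('.dot + .pageno').text)
for page in range(2, last_page + 1):
    WebDriverWait(d, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.p_next'))).click()
    WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#products .p_item')))
    # do something with page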