I am building a scraper for eBay. I am trying to figure out a way to manipulate the page-number portion of the eBay URL to go to the next page until there are no more pages (if you were on page 2, the page-number portion would look like "_pgn=2"). I noticed that if you put in any number greater than the maximum number of pages a listing has, the page reloads to the last page instead of giving a "page doesn't exist" error. (If a listing has 5 pages, then the URL with _pgn=100 routes to the same page as the URL with _pgn=5.) How can I implement a way to start at page one, get the HTML soup of the page, extract all the relevant data I want from the soup, then load the next page with the new page number and repeat the process until there are no new pages to scrape? I tried to get the number of results a listing has by using a Selenium XPath, dividing that number by 50 (the default maximum number of listings per page) and taking math.ceil of the quotient as my max_page, but I get errors saying the element doesn't exist even though it does: self.driver.findxpath('xpath').text. That 243 is what I am trying to get with the XPath.
class EbayScraper(object):
    """Scrape one page of eBay search results and print each listing.

    Attributes:
        base_url: Search endpoint up to and including the ``_nkw=`` parameter.
        driver: Selenium Chrome driver created in ``__init__``; none of the
            methods of this class use it.
        item: Search keywords as supplied by the caller.
        buying_type: Listing filter with ``"=1"`` appended; rewritten to
            ``"BIN=1"`` for "Buy It Now" when the URL is built.
        url_seperator: Fixed query-string part placed before the filter.
        url_seperator2: Query-string part that introduces the page number.
        page_num: Results page to request, as a string.
    """

    def __init__(self, item, buying_type):
        self.base_url = "https://www.ebay.com/sch/i.html?_nkw="
        self.driver = webdriver.Chrome(r"chromedriver.exe")
        self.item = item
        self.buying_type = buying_type + "=1"
        self.url_seperator = "&_sop=12&rt=nc&LH_"
        self.url_seperator2 = "&_pgn="
        self.page_num = "1"

    def _build_url(self):
        """Return the search URL for the current item, filter and page."""
        # Local import keeps this snippet self-contained.
        from urllib.parse import quote_plus

        # getInfo() relies on this normalisation to detect "Buy It Now".
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        # quote_plus turns spaces into "+" and also escapes characters such
        # as "&" that would otherwise corrupt the query string.
        return (self.base_url + quote_plus(self.item) + self.url_seperator
                + self.buying_type + self.url_seperator2 + str(self.page_num))

    def getPageUrl(self):
        """Fetch the current results page and return it as a BeautifulSoup."""
        response = requests.get(self._build_url())
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _parse_price(text):
        """Return the first amount in ``text`` as a float, or None.

        Handles thousands separators ("$1,299.00") and price ranges
        ("$10.00 to $20.00", where the lower bound is returned).
        """
        import re

        match = re.search(r"\d[\d,]*(?:\.\d+)?", text)
        if match is None:
            return None
        return float(match.group(0).replace(",", ""))

    @staticmethod
    def _text_of(tag):
        """Return the text of ``tag``, or an empty string when it is missing."""
        return tag.text if tag is not None else ""

    def getInfo(self, soup):
        """Print title, condition, price, time left and link of each listing.

        Listings without a link element are skipped. Any other missing field
        is printed as an empty value instead of raising.
        """
        for listing in soup.find_all("li", {"class": "s-item"}):
            raw_link = listing.find("a", {"class": "s-item__link"})
            if raw_link is None:
                continue
            title = self._text_of(listing.find("h3", {"class": "s-item__title"}))
            condition = self._text_of(listing.find("span", {"class": "SECONDARY_INFO"}))
            price = self._parse_price(
                self._text_of(listing.find("span", {"class": "s-item__price"})))
            print(title)
            print(condition)
            print(price)
            if self.buying_type != "BIN=1":
                raw_time_left = listing.find("span", {"class": "s-item__time-left"})
                if raw_time_left is not None:
                    # Drop the last four characters of the label, as before.
                    print(raw_time_left.text[:-4])
            print(raw_link.get('href'))
            print('\n')
if __name__ == '__main__':
    # Ask for the search term and listing type, then scrape the first page.
    search_item = input("Item: ")
    listing_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
    instance = EbayScraper(search_item, listing_type)
    instance.getInfo(instance.getPageUrl())
If you want to iterate over all pages and gather all results, your script needs to check whether there is a next page after it visits each page.
import requests
from bs4 import BeautifulSoup
class EbayScraper(object):
    """eBay search scraper that walks every results page.

    ``iterate_page`` fetches a page, prints its listings and moves on for as
    long as the pagination bar shows another page after the selected one.
    """

    def __init__(self, item, buying_type):
        ...
        self.currentPage = 1

    def get_url(self, page=1):
        """Return the search URL for results page ``page`` (200 items/page)."""
        # Local import keeps this snippet self-contained.
        from urllib.parse import quote_plus

        # getInfo() relies on this normalisation to detect "Buy It Now".
        if self.buying_type == "Buy It Now=1":
            self.buying_type = "BIN=1"
        # _ipg=200 means that expect a 200 items per page
        return '{}{}{}{}{}{}&_ipg=200'.format(
            self.base_url, quote_plus(self.item), self.url_seperator,
            self.buying_type, self.url_seperator2, page
        )

    def page_has_next(self, soup):
        """Return True when the pagination bar lists a page after the current one.

        Returns False when there is no pagination bar at all or no selected
        entry in it, instead of raising AttributeError.
        """
        container = soup.find('ol', 'x-pagination__ol')
        if container is None:
            return False
        selected = container.find('li', 'x-pagination__li--selected')
        if selected is None:
            return False
        # find_next_sibling('li') skips whitespace text nodes, which the plain
        # .next_sibling attribute would return and mistake for a next page.
        return selected.find_next_sibling('li') is not None

    def iterate_page(self):
        """Scrape pages starting at ``self.currentPage`` until the last one."""
        # this will loop if there are more pages otherwise end
        while True:
            # Use self, not a module-level instance, so any object works.
            page = self.getPageUrl(self.currentPage)
            self.getInfo(page)
            if not self.page_has_next(page):
                break
            self.currentPage += 1

    def getPageUrl(self, pageNum):
        """Fetch results page ``pageNum`` and return it as a BeautifulSoup."""
        url = self.get_url(pageNum)
        print('page: ', url)
        response = requests.get(url)
        return BeautifulSoup(response.text, 'html.parser')

    def getInfo(self, soup):
        ...
if __name__ == '__main__':
    # Collect the search parameters, then walk every results page.
    search_item = input("Item: ")
    listing_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
    instance = EbayScraper(search_item, listing_type)
    instance.iterate_page()
the important functions here are page_has_next and iterate_page
page_has_next - a function that checks whether the pagination bar has another li element after the selected page. E.g. for < 1 2 3 >, if we are on page 1 it checks whether 2 comes next.
iterate_page - a function that loop until there is no page_next
also note that you don't need selenium for this unless you need to mimic user clicks or need a browser to navigate.
Related
There is my code
def parser():
    """Store new quotes from quotes.toscrape.com, following the Next link.

    Each page is scanned for quotes not yet in the database. A missing
    author is created on the fly. Scraping stops as soon as five new quotes
    have been stored from a single page (the counter restarts on every page,
    as in the original code), or when the last page has been processed.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import urljoin

    url = 'https://quotes.toscrape.com'
    while True:
        responce = requests.get(url)
        soup = BeautifulSoup(responce.text, 'html.parser')
        # Query the page once instead of re-running find_all for every index.
        quotes = soup.find_all('span', {'class': 'text'})
        authors = soup.find_all('small', {'class': 'author'})
        q_count = 0
        for quote, author in zip(quotes, authors):
            if Quote.objects.filter(quote=quote.string).exists():
                continue
            a, _created = Author.objects.get_or_create(name=author.string)
            Quote.objects.create(quote=quote.string, author_id=a.id)
            q_count += 1
            if q_count >= 5:
                return
        # The last page has no "next" item; stop instead of dereferencing None.
        next_page = soup.find('li', {'class': 'next'})
        if next_page is None or next_page.a is None:
            return
        # Resolve the href against the current URL rather than appending it,
        # so the address does not grow with every page.
        url = urljoin(url, next_page.a['href'])
I need to get the next page, but I get this exception: 'NoneType' object has no attribute 'a'
How can I fix that, and how could I optimize my code? Thanks.
Upon reaching the last page there will be no Next button so you need an exit condition check prior to attempting to access the href for next page. One possibility would be to add the following lines before your current last line:
# Look up the "next" pagination item; it is absent on the last page.
next_page = soup.find('li', {'class': 'next'})
if not next_page:
    flag = False # or return
Or simply return at that point.
You'd also update the last line to use the variable, of course, and ensure you are not continuously extending url with suffixes of next page. For example, one could add the suffix during the requests call:
def parser():
    """Walk quotes.toscrape.com page by page until no Next link remains.

    The path of the next page is kept in ``suffix`` and appended to the fixed
    base URL at request time, so the base URL itself never changes.
    """
    url = 'https://quotes.toscrape.com'
    suffix = ''
    flag = True
    while flag:
        page_response = requests.get(url + suffix)
        soup = BeautifulSoup(page_response.text, 'html.parser')
        # other code
        next_link = soup.find('li', {'class': 'next'})
        if next_link:
            suffix = next_link.a['href']
        else:
            return
I am in my infancy of Python coding. What I am trying to do is build a web scraper which gets all the links from a website and then returns certain elements from each page. The code I started with is from https://www.thepythoncode.com/article/extract-all-website-links-python
this works really nicely to get all the links from a website.
As I am only interested in the internal links, I have added some extra code to try to get the elements (title, h1, and some other bits which I haven't added yet). The issue I am running into is, I think, that an href returns an email address and the code then tries to extract the elements from it, which obviously fails. I have tried to stop it picking up the email (which I also thought would be handled in the is_valid function), but I am obviously missing something. Any help would be really appreciated.
import re
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
# colorama foreground colour codes used in the crawler's console output
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
# module-level result sets filled while crawling
internal_urls = set()  # links classified as internal by get_all_website_links
external_urls = set()  # links classified as external by get_all_website_links
title_urls = set()  # page title texts collected by get_title
def is_valid(url):
    """
    Checks whether `url` is a valid URL that the crawler can fetch.

    A URL qualifies only when it has a host and its scheme is http or https.
    Links such as `mailto:` addresses are rejected, including after they have
    been rebuilt as `mailto://user@host`, which has both a scheme and a
    netloc and would otherwise be treated as a page.
    """
    parsed = urlparse(url)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website

    Also prints the page's title and headings via `get_title` and
    `get_heading_tags`, and records every new link in the module-level
    `internal_urls` / `external_urls` sets. Only http(s) links are
    considered, so `mailto:`, `tel:` and similar hrefs are ignored.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # The old check (`!= " " or != None`) was always true; test for a real tag.
    if soup.find('title') is not None:
        get_title(url)
    get_heading_tags(url)
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        parsed_href = urlparse(urljoin(url, href))
        if parsed_href.scheme not in ("http", "https"):
            # e-mail, phone, javascript: ... links cannot be crawled
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        host = parsed_href.netloc
        # Compare hosts rather than searching the whole URL for the domain,
        # so a foreign URL that merely mentions the domain stays external.
        # Subdomains of the crawled site still count as internal.
        if host != domain_name and not host.endswith("." + domain_name):
            # external link
            external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
# number of urls visited so far will be stored here
# (incremented by crawl() through a `global` declaration)
total_urls_visited = 0
def get_title(url):
    """
    Fetches `url`, prints each non-empty <title> text and its length, and
    stores the text in the module-level `title_urls` set.
    """
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for title in soup.find_all('title'):
        title_text = title.get_text()
        # The old test compared the tag itself with "" and None using `and`,
        # which could never be true; skip titles whose text is blank instead.
        if not title_text.strip():
            continue
        title_urls.add(title_text)
        print(title_text)
        print(len(title_text))
def get_heading_tags(url):
    """
    Fetches `url` and prints every h1/h2/h3 heading with a running number,
    its text and its length not counting spaces.
    """
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    heading_tags = ['h1', 'h2', 'h3']
    # find_all only yields tags, so the old `== " "` / `== None` guard never
    # fired and is dropped; enumerate replaces the manual counter.
    for i, tags in enumerate(soup.find_all(heading_tags), start=1):
        tags_text = tags.get_text()
        letters_in_tags = len(tags_text) - tags_text.count(" ")
        print(f'{tags.name} {i} -> {tags_text} -> Length ->{letters_in_tags} ')
def crawl(url, max_urls=80):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 80.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        # re.search() returns a match object or None, never True, so the old
        # `!= True` test let every link through. Follow only http(s) links;
        # mailto: and similar addresses cannot be fetched.
        if not link.startswith(("http://", "https://")):
            continue
        crawl(link, max_urls=max_urls)
if __name__ == "__main__":
    crawl("https://website.com/") #put website here.
    # Summarise what the crawl collected in the module-level sets.
    internal_count = len(internal_urls)
    external_count = len(external_urls)
    print("[+] Total Internal links:", internal_count)
    print("[+] Total External links:", external_count)
    print("[+] Total URLs:", external_count + internal_count)
# Excerpt of the link loop as written in the question's crawl() function.
for link in links:
    # re.search() returns a match object or None, never the bool True, so
    # this `!= True` comparison holds for every link and filters nothing.
    if re.search('#',link) != True:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
You are only checking whether # is present in the link to decide if it is an email or not, and even that check is not correct: re.search returns a match object or None, never True. Also note that ordinary links can contain # as well.
Basically, emails inside <a> tags will be of the form <a href="mailto:someone@example.com">.
So, to differentiate emails from links, you can use the check below.
for link in links:
    # E-mail links are skipped up front; everything else is crawled.
    if link.startswith('mailto:'):
        continue
    if total_urls_visited > max_urls:
        break
    crawl(link, max_urls=max_urls)
This will ignore all the emails and only scrape links.
I am a beginner in web scraping, and I need help with this problem.
The website, allrecipes.com, is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click-> view page source)
I want to create a program that takes an input, searches for it on allrecipes, and returns a list of tuples for the first five recipes, with data such as the time each takes to make, serving yield, ingredients, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup
def searchdata():
    """Prompt for a recipe name and fetch the top result pages from allrecipes.

    The two ``#fill in`` comments mark the parts that are still to be
    written: collecting the result links from the search page, and
    extracting the details from each recipe page.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import quote_plus

    inp = input('what recipe would you like to search')
    # Encode the search term so spaces and "&" do not break the query string.
    url = 'http://www.allrecipes.com/search/results/?wt=' + quote_plus(str(inp)) + '&sort=re'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    #fill in code for finding top 3 or five links
    # Result accumulators are created once, before the loop, so that values
    # gathered for one recipe are not discarded when the next one is fetched.
    names = []
    time = []
    servings = []
    ratings = []
    ingrediants = []
    # Slicing copes with fewer than three links; indexing links[i] raised
    # IndexError. The original `for` line also lacked its colon.
    for link in links[:3]:
        a = requests.get(link)
        soupa = BeautifulSoup(a.text, 'html.parser')
        #fill in code to find name, ingrediants, time, and serving size with data from soupa
# Runs the search as soon as this module is loaded.
searchdata()
Yes, I know, my code is very messy, but what should I fill in for the two fill-in areas?
Thanks
After searching for the recipe you have to get the links of each recipe and then request again for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup
class Scraper:
    """Search allrecipes.com and print details of the first five recipes.

    Attributes:
        links: URLs of the recipes found by the last search.
        names: Recipe names, parallel to ``links``.
        soup: Parsed HTML of the most recently fetched page.
    """

    def __init__(self):
        # Per-instance state; mutable class attributes would be shared.
        self.links = []
        self.names = []
        self.soup = None

    def get_url(self, url):
        """Download ``url`` and keep the parsed page in ``self.soup``."""
        response = requests.get(url)
        self.soup = BeautifulSoup(response.content, 'html.parser')

    def print_info(self, name):
        """Search for ``name`` and print the first five matching recipes.

        Prints a "no recipes" message and returns when the search page shows
        zero results or lacks the expected result elements.
        """
        # Local import keeps this snippet self-contained.
        from urllib.parse import quote_plus

        self.get_url(
            f'https://www.allrecipes.com/search/results/?wt={quote_plus(name)}&sort=re')
        subtext = self.soup.find('span', class_='subtext')
        results = self.soup.find('section', id='fixedGridSection')
        # A missing element means the page is not a normal result list;
        # treat it like an empty result instead of raising AttributeError.
        if (subtext is None or results is None
                or subtext.text.strip().startswith('0')):
            print(f'No recipes found for {name}')
            return

        headings = (article.find('h3', class_='fixed-recipe-card__h3')
                    for article in results.find_all('article'))
        # Keep the first five cards that have both a link and a name span.
        texts = [h3 for h3 in headings
                 if h3 is not None and h3.a is not None and h3.a.span is not None][:5]
        self.links = [txt.a['href'] for txt in texts]
        self.names = [txt.a.span.text for txt in texts]
        self.get_data()

    def get_data(self):
        """Fetch every recipe in ``self.links`` and print its details."""
        for name, link in zip(self.names, self.links):
            self.get_url(link)
            print('-' * 4 + name + '-' * 4)
            info_names = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-header')]
            info_bodies = self.soup.find_all('div', class_='recipe-meta-item-body')
            ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
            ingredients = [span.text.strip() for span in ingredient_spans]
            # zip stops at the shorter list, so a header/body mismatch cannot
            # raise IndexError, and no index shadows the outer loop.
            for header, body in zip(info_names, info_bodies):
                print(header.capitalize(), body.text.strip())
            print()
            # Centre the heading over the first ingredient when there is one.
            width = len(ingredients[0]) if ingredients else 0
            print('Ingredients'.center(width, ' '))
            print('\n'.join(ingredients))
            print()
            print('*' * 50, end='\n\n')
# Ask for a recipe name, then search for it and print the results.
chrome = Scraper()
recipe_query = input('What recipe would you like to search: ')
chrome.print_info(recipe_query)
I am trying to extract some information about MTG cards from a webpage with the following program, but I repeatedly retrieve information about the initial page given (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe there is a restriction in using urllib that slipped my attention. Here is the code that I have been struggling with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup
# Crawl the first NumOfPages result pages of a card search and print, for
# every card, its name, power/toughness (when present) and type.
InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4  # depth of pages to be retrieved
query = InitUrl.split("?")[1]
# NOTE(review): if every request returns the first page's cards, the site may
# require a cookie-carrying session for paging - confirm.
for i in range(0, NumOfPages):
    # The first iteration uses the initial URL; later ones use the URL that
    # was prepared at the end of the previous iteration.
    Url = InitUrl if i == 0 else URL_Next
    print(Url)
    # The context manager closes the connection even if read() fails.
    with uReq(Url) as UClient:  # downloading the url
        page_html = UClient.read()
    page_soup = soup(page_html, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")
    # Building the next URL is plain string concatenation and cannot raise
    # IndexError, so the former try/except/else/finally is flattened into
    # the statements that actually ran.
    URL_Next = InitUrl + "&page=" + str(i + 2)
    print("The next URL is: " + URL_Next + "\n")
    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require them to allow paging.
A clean and simple way of extracting the data you're interested in would be like this:
import requests
from bs4 import BeautifulSoup
# the site actually uses this url under the hood for paging - check out Google Dev Tools
# (the `{}` after `page=` is a str.format placeholder for the page number)
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"
# list that the scraped results are collected in
return_list = []
# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
# initial request to the site root through the session, before any paging
session.get("https://mtgsingles.gr/")
All pages have a next button except the last one. So we use this knowledge to loop until the next-button goes away. When it does - meaning that the last page is reached - the button is replaced with a 'li'-tag with the class of 'next hidden'. This only exists on the last page
Now we're ready to start looping
# Request page after page through the session and collect one dict per card,
# until the page carries the hidden "next" marker.
page = 1  # set count for start page
keep_paging = True  # use flag to end loop when last page is reached
while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')
    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        # Not every card has power/toughness. Test for the missing element
        # explicitly instead of hiding every error behind a bare except.
        if toughness_element is not None:
            toughness = toughness_element.get_text().strip()
        else:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)
    # 'li.next.hidden' only exists if the last page is reached. A page with
    # no items at all also ends the loop, so an error page or a layout
    # change cannot make it run forever.
    if not items or soup.select('li.next.hidden'):
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1
# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will scroll until no more pages exists - no matter how many subpages would be in the site.
My point in the comment above was merely that if you encounter an Exception in your code, your pagecount would never increase. That's probably not what you want to do, which is why I recommended you to learn a little more about the behaviour of the whole try-except-else-finally deal.
I am also baffled by the requests returning the same reply and ignoring the page parameter. As a dirty solution, I can suggest that you first set the page size to a number high enough to get all the items that you want (this parameter works for some reason...).
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup
# Fetch the search results in one oversized page and print each card once it
# has been added to a set of card descriptions.
InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2 # depth of pages to be retrieved
query = "dragon"
cardSet = set()
for i in range(1, NumOfPages):
    # A very large page-size asks for all items in a single response.
    request_params = {"page": i, "q": query, "page-size": 999}
    page_html = requests.get(InitUrl, params=request_params)
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        name_node = card.div.div.strong.span.contents[3].contents[0]
        card_name = name_node.replace("\xa0 ", "")
        has_power_toughness = len(card.div.contents) > 3
        cardP_T = (
            card.div.contents[3].contents[1].text.replace("\n", "").strip()
            if has_power_toughness
            else "Does not exist"
        )
        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)
    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " +str(len(cards)) +"(cards)\n")
I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
# Reader over the semicolon-delimited TLD list; the file handle is opened
# inline and never explicitly closed.
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
# Writer for the comma-delimited output file; scrape_page() writes to it.
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
# Top-level domain to scrape; read as a global by the functions below.
tld = "uz"
# NOTE(review): has_next and page are not read by any function shown here.
has_next = True
page = 0
def create_link(tld, page):
    """Return the listing URL for ``tld`` at the given page.

    Page 0 is the bare listing URL; any other page gets a
    ``/page/<repr(page)>`` suffix.
    """
    if page != 0:
        return "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
    return "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
def check_for_next(soup):
    """Return False when the page's "Next" control is disabled, else True.

    Every element with class ``pagingDivDisabled`` is inspected. Looking
    only at the first one misses a disabled "Next" that follows a disabled
    "Previous". The element's text is searched because ``in`` on a tag
    tests membership among its children, not a substring of its text.
    """
    for disabled_nav in soup.find_all(class_="pagingDivDisabled"):
        if "Next" in disabled_nav.get_text():
            return False
    return True
def make_soup(link):
    """Fetch ``link`` via jabba_webkit and parse the result with lxml."""
    return BeautifulSoup(jw.get_page(link), "lxml")
def all_the_pages(counter):
    """Yield page numbers from ``counter`` up to and including the last page.

    Each page is fetched to check whether its "Next" control is enabled.
    The page number is yielded before that check, so the final page (the
    one whose "Next" is disabled) is also produced instead of being dropped.
    """
    while True:
        soup = make_soup(create_link(tld, counter))
        yield counter
        if not check_for_next(soup):
            break
        counter += 1
def scrape_page(soup):
    """Write the site name from every row of the rank table to ``sites``.

    Does nothing when the page has no ``rankTable``. The cells are taken
    from ``<tbody>`` when present, otherwise from the table itself.
    """
    table = soup.find('table', {'class': 'rankTable'})
    if table is None:
        # A page without the table has nothing to record.
        return
    # Not every parser or page provides a <tbody>; fall back to the table.
    body = table.find('tbody')
    if body is None:
        body = table
    cells = body.find_all("td")
    # Every third cell, starting with the second, is the one written out.
    for cell in cells[1::3]:
        #print test[cell]
        content = re.sub("<[^>]*>", "", repr(cell))
        sites.writerow([tld] + [content])
def main():
    """Scrape every listing page, printing its number and link on the way."""
    for page in all_the_pages(0):
        print(page)
        page_link = create_link(tld, page)
        print(page_link)
        scrape_page(make_soup(page_link))
# Runs the scraper as soon as this module is loaded.
main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page and move to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm doing it here make sense?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
    """Return the number of the first ``/page/N`` URL that has no data rows.

    Pages are requested in order starting at 1. The search stops at the
    first page whose rank table is missing or has at most one ``<tr>``.
    """
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        # Name the parser explicitly: results then do not depend on which
        # parsers happen to be installed, and bs4 emits no warning.
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {'class': 'rankTable'})
        # A missing table counts as an empty page instead of raising
        # AttributeError.
        if table is None or len(table.find_all('tr')) <= 1:
            return pages
        pages += 1