There is my code
def parser():
flag = True
url = 'https://quotes.toscrape.com'
while flag:
responce = requests.get(url)
soup = BeautifulSoup(responce.text, 'html.parser')
quote_l = soup.find_all('span', {'class': 'text'})
q_count = 0
for i in range(len(quote_l)):
if q_count >= 5:
flag = False
break
quote = soup.find_all('span', {'class': 'text'})[i]
if not Quote.objects.filter(quote=quote.string).exists():
author = soup.find_all('small', {'class': 'author'})[i]
if not Author.objects.filter(name=author.string).exists():
a = Author.objects.create(name=author.string)
Quote.objects.create(quote=quote.string, author_id=a.id)
q_count += 1
else:
a = Author.objects.get(name=author.string)
Quote.objects.create(quote=quote.string, author_id=a.id)
q_count += 1
url += soup.find('li', {'class': 'next'}).a['href']
I need to get the next page but I have this Exc. 'NoneType' object has no attribute 'a'
How to fix that and maybe how I can optimize my Code.Thx
Upon reaching the last page there will be no Next button so you need an exit condition check prior to attempting to access the href for next page. One possibility would be to add the following lines before your current last line:
next_page = soup.find('li', {'class': 'next'})
if not next_page: flag = False # or return
Or simply return at that point.
You'd also update the last line to use the variable, of course, and ensure you are not continuously extending url with suffixes of next page. For example, one could add the suffix during the requests call:
def parser():
flag = True
url = 'https://quotes.toscrape.com'
suffix = ''
while flag:
responce = requests.get(url + suffix)
soup = BeautifulSoup(responce.text, 'html.parser')
# other code
next_page = soup.find('li', {'class': 'next'})
if not next_page:
return
suffix = next_page.a['href']
Related
I am trying to hack together code that loops through a few URLs and grabs a few data points from each URL. Here is my super-hackey code.
import requests
from bs4 import BeautifulSoup
base_url = "https://www.amazon.com/s?k=mountain+bikes&ref=nb_sb_noss_"
current_page = 1
while current_page < 5:
print(current_page)
url = base_url + str(current_page)
#current_page += 1
r = requests.get(url)
zute_soup = BeautifulSoup(r.text, 'html.parser')
firme = zute_soup.findAll('div', {'class': 'brand-follow-tooltip-root'})
title = []
desc = []
page = []
for title in firme:
title1 = title.findAll('h1')[0].text
print(title1)
adresa = title.findAll('div', {'class': 'brand-follow-tooltip-root'})[0].text
print(adresa)
print('\n')
page_line = "{title1}\n{adresa}".format(
title1=title1,
adresa=adresa
)
title.append(title1)
desc.append(adresa)
page.append(page_line)
current_page += 1
The code finishes in a few seconds and I get no errors, but nothing in appended to any of the lists. I think this is close, but I don't what what the issue is here.
For every iteration you are nullying them, is this expected ?
while current_page < 5:
.
.
.
title = []
desc = []
page = []
.
.
.
title.append(title1)
desc.append(adresa)
page.append(page_line)
current_page += 1
Move
title = []
desc = []
page = []
out of while loop. And your appendeds wont be nullified.
I am building a scraper for Ebay. I am trying to figure out a way to manipulate the page number portion of the Ebay url to go to the next page until there are no more pages (If you were on page 2 the page number portion would look like "_pgn=2"). I noticed that if you put any number greater than the max number of pages a listing has, the page will reload to the last page, not give like a page doesn't exist error. (If a listing has 5 pages, then the last listing' page number url portion of _pgn=5 would rout to the same page if the page number url portion was _pgn=100). How can I implement a way to start at page one, get the html soup of the page, get the all relevant data I want from the soup, then load up the next page with the new page number and start the process again until there are not any new pages to scrape? I tried to get the number of results a listing has by using selenium xpath and math.ceil the quotient of number of results and 50 (default number of max listings per page) and use that quotient as my max_page, but I get errors saying the element doesn't exist even though it does. self.driver.findxpath('xpath').text. That 243 is what I am trying to get with the xpath.
class EbayScraper(object):
def __init__(self, item, buying_type):
self.base_url = "https://www.ebay.com/sch/i.html?_nkw="
self.driver = webdriver.Chrome(r"chromedriver.exe")
self.item = item
self.buying_type = buying_type + "=1"
self.url_seperator = "&_sop=12&rt=nc&LH_"
self.url_seperator2 = "&_pgn="
self.page_num = "1"
def getPageUrl(self):
if self.buying_type == "Buy It Now=1":
self.buying_type = "BIN=1"
self.item = self.item.replace(" ", "+")
url = self.base_url + self.item + self.url_seperator + self.buying_type + self.url_seperator2 + self.page_num
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
return soup
def getInfo(self, soup):
for listing in soup.find_all("li", {"class": "s-item"}):
raw = listing.find_all("a", {"class": "s-item__link"})
if raw:
raw_price = listing.find_all("span", {"class": "s-item__price"})[0]
raw_title = listing.find_all("h3", {"class": "s-item__title"})[0]
raw_link = listing.find_all("a", {"class": "s-item__link"})[0]
raw_condition = listing.find_all("span", {"class": "SECONDARY_INFO"})[0]
condition = raw_condition.text
price = float(raw_price.text[1:])
title = raw_title.text
link = raw_link['href']
print(title)
print(condition)
print(price)
if self.buying_type != "BIN=1":
raw_time_left = listing.find_all("span", {"class": "s-item__time-left"})[0]
time_left = raw_time_left.text[:-4]
print(time_left)
print(link)
print('\n')
if __name__ == '__main__':
item = input("Item: ")
buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
instance = EbayScraper(item, buying_type)
page = instance.getPageUrl()
instance.getInfo(page)
if you want to iterate all pages and gather all results then your script needs to check if there is a next page after you visit the page
import requests
from bs4 import BeautifulSoup
class EbayScraper(object):
def __init__(self, item, buying_type):
...
self.currentPage = 1
def get_url(self, page=1):
if self.buying_type == "Buy It Now=1":
self.buying_type = "BIN=1"
self.item = self.item.replace(" ", "+")
# _ipg=200 means that expect a 200 items per page
return '{}{}{}{}{}{}&_ipg=200'.format(
self.base_url, self.item, self.url_seperator, self.buying_type,
self.url_seperator2, page
)
def page_has_next(self, soup):
container = soup.find('ol', 'x-pagination__ol')
currentPage = container.find('li', 'x-pagination__li--selected')
next_sibling = currentPage.next_sibling
if next_sibling is None:
print(container)
return next_sibling is not None
def iterate_page(self):
# this will loop if there are more pages otherwise end
while True:
page = instance.getPageUrl(self.currentPage)
instance.getInfo(page)
if self.page_has_next(page) is False:
break
else:
self.currentPage += 1
def getPageUrl(self, pageNum):
url = self.get_url(pageNum)
print('page: ', url)
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
return soup
def getInfo(self, soup):
...
if __name__ == '__main__':
item = input("Item: ")
buying_type = input("Buying Type (e.g, 'Buy It Now' or 'Auction'): ")
instance = EbayScraper(item, buying_type)
instance.iterate_page()
the important functions here are page_has_next and iterate_page
page_has_next - a function that check if the pagination of the page has another li element next to the selected page. e.g < 1 2 3 > if we are on page 1 then it checks if there is 2 next -> something like this
iterate_page - a function that loop until there is no page_next
also note that you don't need selenium for this unless you need to mimic user clicks or need a browser to navigate.
I developed this program to scrape newegg for ps4 prices. However I want to scrape multiple pages. Here is what I have but once it scrapes the first page the program stops. Basically I am trying to change the link so 'pages-1' changes to 2,3,4 etc. Is there a better way to do this?
from bs4 import BeautifulSoup
import requests
import csv
page_num = 1
prod_num = 0
source = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-' + str(page_num) + '?PageSize=36&order=BESTMATCH').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
for info in soup.find_all('div', class_='item-container'):
prod = info.find('a', class_='item-title').text.strip()
price = info.find('li', class_='price-current').text.strip().splitlines()[1].replace(u'\xa0', '')
if u'$' not in price:
price = info.find('li', class_='price-current').text.strip().splitlines()[0].replace(u'\xa0', '')
ship = info.find('li', class_='price-ship').text.strip()
print(prod)
print(price)
print(ship)
csv_writer.writerow([prod, price, ship])
prod_num += 1
if prod_num > 35: #there is about 35 items per newegg page
page_num += 1
# print(price.splitlines()[1])
print('-----------')
csv_file.close()
i found the page limit num here
and i think you can get the page limit by xpath or other ways:
# xpath syntax may like this
# //span[#class='list-tool-pagination-text']
hope it's useful for you
If you noticed, Next "button" tag of last page has attribute "disabled", So [tag_name].has_attr('disabled') return True . Using this you can manage pagination.
import requests
from bs4 import BeautifulSoup
import csv
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
URL_PART1 = "https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-"
URL_PART2 = "?PageSize=36&order=BESTMATCH"
PAGE_NO = 1
url = URL_PART1 + str(PAGE_NO) + URL_PART2
while len(url):
PAGE_NO+=1
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
all_divs = soup.find_all('div', attrs={'class':'item-info'})
for item in all_divs:
prod = ""
price = ""
ship = ""
# get product name
prod = item.find('a', attrs={'class':'item-title'})
if prod:
prod = prod.text.strip()
# get price
price_part = item.find('li', attrs={'class':'price-current'})
if price_part:
price_part1 = price_part.strong
if price_part1:
price_part1 = price_part1.text.strip()
price_part2 = price_part.sup
if price_part2:
price_part2 = price_part2.text.strip()
if price_part1 and price_part2:
price = price_part1 + price_part2
# get shipping info
ship = item.find('li', attrs={'class':'price-ship'})
if ship:
ship = ship.text.strip()
csv_writer.writerow([prod, price, ship])
# manage pagination
next_button = soup.find('button', attrs={'title': 'Next'})
if not(next_button.has_attr('disabled')):
url = URL_PART1 + str(PAGE_NO) + URL_PART2
else:
url = ""
I want to return multiple links in python, but can't figure out how. If I print link_hrefI get all of the links, but when I return, I get only the first link and the app quits. Can anyone help me please?
def main():
def get_links():
offset = 0
while int(offset) < 990:
url = f"https://krmeni.cz/kniha?offset={str(offset)}"
page_content = requests.get(url)
soup = BeautifulSoup(page_content.text, "html.parser")
file_path = "chatbot_data.csv"
offset += 10
for link in soup.find_all('a', {'class': 'white-btn'}):
title = link.string
link_href = link.get("href")
if link.string == "přidat odpověď":
continue
else:
return link_href
for link_href in get_links():
answer_url = f"https://krmeni.cz{get_links()}"
print(answer_url)
Your code exits on the fist if and else statement.
if link.string == "přidat odpověď":
continue
else:
return link_href
Initialize a list before the for loop and append link_href in else statement. The after completion of execution of for loop, return the list.Like this.
link_list = []
for link in soup.find_all('a', {'class': 'white-btn'}):
title = link.string
link_href = link.get("href")
if link.string == "přidat odpověď":
continue
else:
link_list.append(link_href)
return link_list
Or make a generator,
for link in soup.find_all('a', {'class': 'white-btn'}):
title = link.string
link_href = link.get("href")
if link.string == "přidat odpověď":
continue
else:
yield link_href
Your loop exits after finding the first link. Use a list comprehension:
return [link.get('href')
for link in soup.find_all('a', {'class': 'white-btn'})
if link.string == 'pridat odpoved']
This returns a list containing the links you need.
just use generator function yield instead of return
def main():
def get_links():
offset = 0
while int(offset) < 990:
url = f"https://krmeni.cz/kniha?offset={str(offset)}"
page_content = requests.get(url)
soup = BeautifulSoup(page_content.text, "html.parser")
file_path = "chatbot_data.csv"
offset += 10
for link in soup.find_all('a', {'class': 'white-btn'}):
title = link.string
link_href = link.get("href")
if link.string == "přidat odpověď":
continue
else:
yield link_href
for link_href in get_links():
answer_url = f"https://krmeni.cz{get_links()}"
print(answer_url)
I'm trying to make my actual crawler Multithread.
When I set the Multithread, several instance of the function will be started.
Exemple :
If my function I use print range(5) and I will have 1,1,2,2,3,3,4,4,5,5 if I have 2 Thread.
How can can I have the result 1,2,3,4,5 in Multithread ?
My actual code is a crawler as you can see under :
import requests
from bs4 import BeautifulSoup
def trade_spider(max_pages):
page = 1
while page <= max_pages:
url = "http://stackoverflow.com/questions?page=" + str(page)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
for link in soup.findAll('a', {'class': 'question-hyperlink'}):
href = link.get('href')
title = link.string
print(title)
get_single_item_data("http://stackoverflow.com/" + href)
page += 1
def get_single_item_data(item_url):
source_code = requests.get(item_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
res = soup.find('span', {'class': 'vote-count-post '})
print("UpVote : " + res.string)
trade_spider(1)
How can I call trade_spider() in Multithread without duplicate link ?
Have the page number be an argument to the trade_spider function.
Call the function in each process with a different page number so that each thread gets a unique page.
For example:
import multiprocessing
def trade_spider(page):
url = "http://stackoverflow.com/questions?page=%s" % (page,)
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
for link in soup.findAll('a', {'class': 'question-hyperlink'}):
href = link.get('href')
title = link.string
print(title)
get_single_item_data("http://stackoverflow.com/" + href)
# Pool of 10 processes
max_pages = 100
num_pages = range(1, max_pages)
pool = multiprocessing.Pool(10)
# Run and wait for completion.
# pool.map returns results from the trade_spider
# function call but that returns nothing
# so ignoring it
pool.map(trade_spider, num_pages)
Try this:
from multiprocessing import Process, Value
import time
max_pages = 100
shared_page = Value('i', 1)
arg_list = (max_pages, shared_page)
process_list = list()
for x in range(2):
spider_process = Process(target=trade_spider, args=arg_list)
spider_process.daemon = True
spider_process.start()
process_list.append(spider_process)
for spider_process in process_list:
while spider_process.is_alive():
time.sleep(1.0)
spider_process.join()
Change the parameter list of trade_spider to
def trade_spider(max_pages, page)
and remove the
page = 1
This will create two processes that will work through the page list by sharing the page value.