How to update scraping link in for loop - python

I developed this program to scrape newegg for ps4 prices. However I want to scrape multiple pages. Here is what I have but once it scrapes the first page the program stops. Basically I am trying to change the link so 'pages-1' changes to 2,3,4 etc. Is there a better way to do this?
from bs4 import BeautifulSoup
import requests
import csv
page_num = 1
prod_num = 0
source = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-' + str(page_num) + '?PageSize=36&order=BESTMATCH').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
for info in soup.find_all('div', class_='item-container'):
prod = info.find('a', class_='item-title').text.strip()
price = info.find('li', class_='price-current').text.strip().splitlines()[1].replace(u'\xa0', '')
if u'$' not in price:
price = info.find('li', class_='price-current').text.strip().splitlines()[0].replace(u'\xa0', '')
ship = info.find('li', class_='price-ship').text.strip()
print(prod)
print(price)
print(ship)
csv_writer.writerow([prod, price, ship])
prod_num += 1
if prod_num > 35: #there is about 35 items per newegg page
page_num += 1
# print(price.splitlines()[1])
print('-----------')
csv_file.close()

i found the page limit num here
and i think you can get the page limit by xpath or other ways:
# xpath syntax may like this
# //span[#class='list-tool-pagination-text']
hope it's useful for you

If you noticed, Next "button" tag of last page has attribute "disabled", So [tag_name].has_attr('disabled') return True . Using this you can manage pagination.
import requests
from bs4 import BeautifulSoup
import csv
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
URL_PART1 = "https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-"
URL_PART2 = "?PageSize=36&order=BESTMATCH"
PAGE_NO = 1
url = URL_PART1 + str(PAGE_NO) + URL_PART2
while len(url):
PAGE_NO+=1
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
all_divs = soup.find_all('div', attrs={'class':'item-info'})
for item in all_divs:
prod = ""
price = ""
ship = ""
# get product name
prod = item.find('a', attrs={'class':'item-title'})
if prod:
prod = prod.text.strip()
# get price
price_part = item.find('li', attrs={'class':'price-current'})
if price_part:
price_part1 = price_part.strong
if price_part1:
price_part1 = price_part1.text.strip()
price_part2 = price_part.sup
if price_part2:
price_part2 = price_part2.text.strip()
if price_part1 and price_part2:
price = price_part1 + price_part2
# get shipping info
ship = item.find('li', attrs={'class':'price-ship'})
if ship:
ship = ship.text.strip()
csv_writer.writerow([prod, price, ship])
# manage pagination
next_button = soup.find('button', attrs={'title': 'Next'})
if not(next_button.has_attr('disabled')):
url = URL_PART1 + str(PAGE_NO) + URL_PART2
else:
url = ""

Related

how to improve my web scraping code by using multithreading?

This is my code. It is web scraping page by page and extracting the data to Excel. It is taking the next page link by extracting the anchor tag present in pagination of the current page.
Currently it is slow; can someone please help to make it fast by using multithreading or anything else?
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
f = csv.writer(open('GEM.csv', 'w', newline=''))
f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate','pageNumber'])
def scrap_bid_data():
page_no = 1
url = ""
while page_no <= 532:
print('Hold on creating URL to fetch data for...'+str(page_no))
if page_no == 2:
url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + "AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI"
if page_no == 1:
url = 'https://bidplus.gem.gov.in/bidlists?bidlists'
print('URL created: ' + url)
scraped_data = requests.get(url, verify=False)
soup_data = bs(scraped_data.text, 'lxml')
nextlink = soup_data.find('a', {'rel': 'next'})
nxt = nextlink['href'].split('=')[1]
extracted_data = soup_data.find('div', {'id': 'pagi_content'})
if len(extracted_data) == 0:
break
else:
for idx in range(len(extracted_data)):
if (idx % 2 == 1):
bid_data = extracted_data.contents[idx].text.strip().split('\n')
if (len(bid_data) > 1):
print(page_no)
if (len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1):
bidno = bid_data[0].split(":")[-1]
items = bid_data[9].strip().split('Items:')[-1]
qnty = int(bid_data[10].split(':')[1].strip())
dept = (bid_data[11] + bid_data[16].strip()).split(":")[-1]
edate = bid_data[21].split("End Date:")[-1]
f.writerow([bidno, items, qnty, dept, edate,page_no])
page_no=page_no+1
url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' +nxt
print('printing the next url')
print(url)
scrap_bid_data()

Trying to loop through multiple URLs and import some data rom each

I am trying to hack together code that loops through a few URLs and grabs a few data points from each URL. Here is my super-hackey code.
import requests
from bs4 import BeautifulSoup
base_url = "https://www.amazon.com/s?k=mountain+bikes&ref=nb_sb_noss_"
current_page = 1
while current_page < 5:
print(current_page)
url = base_url + str(current_page)
#current_page += 1
r = requests.get(url)
zute_soup = BeautifulSoup(r.text, 'html.parser')
firme = zute_soup.findAll('div', {'class': 'brand-follow-tooltip-root'})
title = []
desc = []
page = []
for title in firme:
title1 = title.findAll('h1')[0].text
print(title1)
adresa = title.findAll('div', {'class': 'brand-follow-tooltip-root'})[0].text
print(adresa)
print('\n')
page_line = "{title1}\n{adresa}".format(
title1=title1,
adresa=adresa
)
title.append(title1)
desc.append(adresa)
page.append(page_line)
current_page += 1
The code finishes in a few seconds and I get no errors, but nothing in appended to any of the lists. I think this is close, but I don't what what the issue is here.
For every iteration you are nullying them, is this expected ?
while current_page < 5:
.
.
.
title = []
desc = []
page = []
.
.
.
title.append(title1)
desc.append(adresa)
page.append(page_line)
current_page += 1
Move
title = []
desc = []
page = []
out of while loop. And your appendeds wont be nullified.

Page loop not working on python webscrape

I'm quite new to python and have written a script using beautifulsoup to parse a website table. I've tried everything but can't get the loop to cycle through pages. It currently just repeats the data on the first page 8 times (number of pages).
Can anyone please help?
Code:
import requests
from bs4 import BeautifulSoup
first_year = 2020
last_year = 2020
for i in range(last_year-first_year+1):
year = str(first_year + i)
print("Running for year:", year)
text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID="+year).text
soup = BeautifulSoup(text, "html.parser")
options = soup.findAll("option")
opts = []
for option in options:
if not option['value'].startswith("20") and not option['value'].startswith("19") and option["value"]:
opts.append({option["value"]: option.contents[0]})
for opt in opts:
for key, value in opt.items():
print("Doing option:", value)
text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year + "&Round=" + key).text
pages_soup = BeautifulSoup(text, "html.parser")
p = pages_soup.findAll("a")
pages = 8
if "&Page=" in str(p[-2]):
pages = int(p[-2].contents[0])
for j in range(pages):
print("Page {}/{}".format(str(j+1), str(pages)))
parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
p_soup = BeautifulSoup(text, "html.parser")
tbody = pages_soup.findAll("tbody")
tbody_soup = BeautifulSoup(str(tbody), "html.parser")
tr = tbody_soup.findAll("tr")
for t in tr:
t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
t = t[4:len(t)-5].split('</td><td>')
t.append(str(j+1))
t.append(str(value))
t.append(str(year))
open("output.csv", "a").write("\n" + ";".join(t))
Thankyou.
Try this..
for j in range(pages):
print("Page {}/{}".format(str(j+1), str(pages)))
parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
p_soup = BeautifulSoup(parse, "html.parser")
tbody = p_soup.findAll("tbody")
tbody_soup = BeautifulSoup(str(tbody), "html.parser")
tr = tbody_soup.findAll("tr")
for t in tr:
t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
t = t[4:len(t)-5].split('</td><td>')
t.append(str(j+1))
t.append(str(value))
t.append(str(year))
open("output.csv", "a").write("\n" + ";".join(t))

How can I crawl all the page?

I am trying to crawl sites's text. But It's only crawling 12 articles.
I don't know why does it do like that. and I wondering If I wanna crawl other pages, What should I do?
import requests
from bs4 import BeautifulSoup
x = int(input("start page:"))
while x < int(input("end page:")):
x = x + 1
url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
result = requests.get(url)
bs_obj = BeautifulSoup(result.content, "html.parser")
content = bs_obj.find("div", {"class": "msp-three-col"})
read_more = content.findAll("div", {"class": "read-more"})
for item in read_more:
atag = item.find('a')
link = "https://www.mmtimes.com" + atag["href"]
linkResult = requests.get(link)
subpage = BeautifulSoup(linkResult.content, "html.parser")
fnresult = subpage.find("div", {"class": "field-item even"})
print(fnresult.text)
print("Total "+str(len(read_more))+" articles"))
Check out the below code, I have made some changes. this will result the required output.
import requests
from bs4 import BeautifulSoup
x = int(input("start page:"))
y = input("end page:")
article_count = 0
while x <= int(y):
url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
result = requests.get(url)
bs_obj = BeautifulSoup(result.content, "html.parser")
content = bs_obj.find("div", {"class": "msp-three-col"})
read_more = content.findAll("div", {"class": "read-more"})
for item in read_more:
atag = item.find('a')
link = "https://www.mmtimes.com" + atag["href"]
linkResult = requests.get(link)
subpage = BeautifulSoup(linkResult.content, "html.parser")
fnresult = subpage.find("div", {"class": "field-item even"})
print(fnresult.text)
article_count += len(read_more)
print("Total "+str(article_count)+" articles")
x += 1

Not getting second page data using python Request

I am trying to get movie reviews from Fandango website. Even when I hit the URL for the second page onwards of movie reviews for a particular movie I keep getting the first page. Do I need to send cookies with requests?
Below is my code snippet:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
baseUrl = 'https://www.fandango.com/movie-reviews'
req = Request(baseUrl, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')
# Getting all the movie links from the first page
movieLinks = soup.find_all("a", class_='dark')
# Get reviews for every movie
for i in range(2):#len(movieLinks)
try:
movieName = movieLinks[i].text.replace(' Review', '')
count = 1
print('\n\n****** ' + movieName + ' ********\n\n')
# Getting movie reviews from first 10
for j in range(3):
pageNum = j + 1;
movieReviewUrl = movieLinks[i]['href'] + '?pn=' + str(pageNum)
print('Hitting URL: ' + movieReviewUrl)
revReq = Request(movieReviewUrl, headers = {'User-Agent': 'Mozilla/5.0'})
revWebpage = urlopen(revReq).read()
revSoup = BeautifulSoup(revWebpage, 'html.parser')
revArr = revSoup.find_all("p", class_ = "fan-reviews__item-content")
for k in range(len(revArr)):
if len(revArr[k])>0:
print(str(count) + ' : ' + revArr[k].text)
count = count + 1
except:
print('Error for movie: ' + movieName)
I suggest using Requests, it's much easier to handle such requests with it.
from bs4 import BeautifulSoup
import requests
baseUrl = 'https://www.fandango.com/movie-reviews'
# req = Request(baseUrl, headers={'User-Agent': 'Mozilla/5.0'})
webpage = requests.get(baseUrl).text
soup = BeautifulSoup(webpage, 'html.parser')
# Getting all the movie links from the first page
movieLinks = soup.find_all("a", class_='dark')
# Get reviews for every movie
for i in range(2):#len(movieLinks)
try:
movieName = movieLinks[i].text.replace(' Review', '')
count = 1
print('\n\n****** ' + movieName + ' ********\n\n')
# Getting movie reviews from first 10
for j in range(3):
pageNum = j + 1;
movieReviewUrl = movieLinks[i]['href'] + '?pn=' + str(pageNum)
print('Hitting URL: ' + movieReviewUrl)
# revReq = Request(movieReviewUrl, headers = {'User-Agent': 'Mozilla/5.0'})
# revWebpage = urlopen(revReq).read()
revWebpage = requests.get(movieReviewUrl).text
revSoup = BeautifulSoup(revWebpage, 'html.parser')
revArr = revSoup.find_all("p", class_ = "fan-reviews__item-content")
print(len(revArr))
for k in range(len(revArr)):
if len(revArr[k])>0:
print(str(count) + ' : ' + revArr[k].text)
count = count + 1
except:
print('Error for movie: ' + movieName)
When you run it, you can see that revArr is returning 0, so please check "fan-reviews__item-content".

Categories

Resources