This is my code. It scrapes page by page and writes the extracted data to a CSV file. It gets the next-page link by extracting the anchor tag in the pagination of the current page.
Currently it is slow; can someone please help me make it faster, using multithreading or anything else?
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
f = csv.writer(open('GEM.csv', 'w', newline=''))
f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate','pageNumber'])
def scrap_bid_data():
    page_no = 1
    url = ""
    while page_no <= 532:
        print('Hold on creating URL to fetch data for...' + str(page_no))
        if page_no == 2:
            url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + "AMCR24yMNFkfoXF3wKPmGMy_wV8TJPAlxm6oWiTHGOI"
        if page_no == 1:
            url = 'https://bidplus.gem.gov.in/bidlists?bidlists'
        print('URL created: ' + url)
        scraped_data = requests.get(url, verify=False)
        soup_data = bs(scraped_data.text, 'lxml')
        nextlink = soup_data.find('a', {'rel': 'next'})
        nxt = nextlink['href'].split('=')[1]
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        if len(extracted_data) == 0:
            break
        else:
            for idx in range(len(extracted_data)):
                if idx % 2 == 1:
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')
                    if len(bid_data) > 1:
                        print(page_no)
                        if len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1:
                            bidno = bid_data[0].split(":")[-1]
                            items = bid_data[9].strip().split('Items:')[-1]
                            qnty = int(bid_data[10].split(':')[1].strip())
                            dept = (bid_data[11] + bid_data[16].strip()).split(":")[-1]
                            edate = bid_data[21].split("End Date:")[-1]
                            f.writerow([bidno, items, qnty, dept, edate, page_no])
        page_no = page_no + 1
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + nxt
        print('printing the next url')
        print(url)

scrap_bid_data()
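One option is sketched below, under the assumption that the page URLs (or next-page tokens) have already been collected up front: on this site the next-page link has to be read from each page, so the links would first be gathered sequentially, and only the downloads themselves are then spread across a thread pool with concurrent.futures. The page_urls list is a placeholder, and the row-extraction logic from the question is meant to be reused where the comment indicates.

from concurrent.futures import ThreadPoolExecutor
import requests
from urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup as bs

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def fetch(url):
    # network I/O releases the GIL, so a thread pool speeds up the downloads
    return url, requests.get(url, verify=False).text

# placeholder list: fill with the real page links collected beforehand
page_urls = ['https://bidplus.gem.gov.in/bidlists?bidlists']

with ThreadPoolExecutor(max_workers=8) as pool:
    for url, html in pool.map(fetch, page_urls):
        soup_data = bs(html, 'lxml')
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        # ...reuse the row-extraction and CSV-writing logic from the question here...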
Related:
Currently I am printing the data. Rather than printing, I want to export it to Excel/CSV. I am new to Python, please help.
**The data is quite large, around 9000 rows with 6 columns.**
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def scrape_bid_data():
    page_no = 1  # initial page number
    while True:
        print('Hold on, creating URL to fetch data...')
        URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)  # create dynamic URL
        print('URL created: ' + URL)
        scraped_data = requests.get(URL, verify=False)  # request to get the data
        soup_data = bs(scraped_data.text, 'lxml')  # parse the scraped data using lxml
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})  # find the div which contains the required data
        if len(extracted_data) == 0:  # if extracted_data is empty, stop further execution of the script
            break
        else:
            for idx in range(len(extracted_data)):  # loop through all the divs and extract and print data
                if idx % 2 == 1:  # the required data sits at odd indexes only
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')
                    print('-' * 100)
                    print(bid_data[0])   # BID number
                    print(bid_data[5])   # Items
                    print(bid_data[6])   # Quantity required
                    print(bid_data[10] + bid_data[12].strip())  # Department name and address
                    print(bid_data[16])  # Start date
                    print(bid_data[17])  # End date
                    print('-' * 100)
        page_no += 1  # increment the page number by 1

scrape_bid_data()
I think you should start by returning the extracted_data object containing your data at the end of your function.
page_no = 1

def scrap_bid_data(page):
    print('Hold on, creating URL to fetch data...')
    URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page)
    print('URL created: ' + URL)
    scraped_data = requests.get(URL, verify=False)  # request to get the data
    soup_data = bs(scraped_data.text, 'lxml')  # parse the scraped data using lxml
    extracted_data = soup_data.find('div', {'id': 'pagi_content'})
    return extracted_data
Then use it to create a dataframe
import pandas as pd

extract_data = scrap_bid_data(page_no)
df = pd.DataFrame(extract_data)
and then export this DataFrame:
df.to_csv('file_name_{}'.format(page_no))
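Building on that, here is a rough sketch of collecting several pages into a single CSV instead of one file per page. The column positions are taken from the question's print statements and may need adjusting, and the page range is a hypothetical example:

import pandas as pd

rows = []
for page_no in range(1, 6):  # hypothetical small page range
    extracted_data = scrap_bid_data(page_no)
    for idx in range(len(extracted_data)):
        if idx % 2 == 1:  # required data sits at odd indexes, as in the question
            bid_data = extracted_data.contents[idx].text.strip().split('\n')
            rows.append({
                'Bidnumber': bid_data[0],
                'Items': bid_data[5],
                'Quantity': bid_data[6],
                'pageNumber': page_no,
            })

df = pd.DataFrame(rows)
df.to_csv('GEM_all_pages.csv', index=False)  # one combined file instead of one per page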
I developed this program to scrape Newegg for PS4 prices. However, I want to scrape multiple pages. Here is what I have, but once it scrapes the first page the program stops. Basically, I am trying to change the link so 'Page-1' changes to 2, 3, 4, etc. Is there a better way to do this?
from bs4 import BeautifulSoup
import requests
import csv
page_num = 1
prod_num = 0
source = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-' + str(page_num) + '?PageSize=36&order=BESTMATCH').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
for info in soup.find_all('div', class_='item-container'):
    prod = info.find('a', class_='item-title').text.strip()
    price = info.find('li', class_='price-current').text.strip().splitlines()[1].replace(u'\xa0', '')
    if u'$' not in price:
        price = info.find('li', class_='price-current').text.strip().splitlines()[0].replace(u'\xa0', '')
    ship = info.find('li', class_='price-ship').text.strip()
    print(prod)
    print(price)
    print(ship)
    csv_writer.writerow([prod, price, ship])
    prod_num += 1
    if prod_num > 35:  # there are about 35 items per Newegg page
        page_num += 1
        # print(price.splitlines()[1])
    print('-----------')

csv_file.close()
I found the page limit number here, and I think you can get the page limit via XPath or other means:
# XPath syntax may look like this
# //span[@class='list-tool-pagination-text']
Hope it's useful for you.
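For example, here is a rough BeautifulSoup sketch of reading that element; the "Page 1/28"-style text format is an assumption and may differ on the live site:

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-1?PageSize=36&order=BESTMATCH')
soup = BeautifulSoup(resp.text, 'html.parser')

pagination = soup.find('span', class_='list-tool-pagination-text')
if pagination:
    # assumed text like "Page 1/28" -> the total page count is after the slash
    last_page = int(pagination.get_text().strip().split('/')[-1])
    print('last page:', last_page)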
If you look closely, the Next "button" tag on the last page has the attribute "disabled", so [tag_name].has_attr('disabled') returns True. You can use this to manage pagination.
import requests
from bs4 import BeautifulSoup
import csv
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
URL_PART1 = "https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-"
URL_PART2 = "?PageSize=36&order=BESTMATCH"
PAGE_NO = 1
url = URL_PART1 + str(PAGE_NO) + URL_PART2
while len(url):
    PAGE_NO += 1
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    all_divs = soup.find_all('div', attrs={'class': 'item-info'})
    for item in all_divs:
        prod = ""
        price = ""
        ship = ""
        # get product name
        prod = item.find('a', attrs={'class': 'item-title'})
        if prod:
            prod = prod.text.strip()
        # get price
        price_part = item.find('li', attrs={'class': 'price-current'})
        if price_part:
            price_part1 = price_part.strong
            if price_part1:
                price_part1 = price_part1.text.strip()
            price_part2 = price_part.sup
            if price_part2:
                price_part2 = price_part2.text.strip()
            if price_part1 and price_part2:
                price = price_part1 + price_part2
        # get shipping info
        ship = item.find('li', attrs={'class': 'price-ship'})
        if ship:
            ship = ship.text.strip()
        csv_writer.writerow([prod, price, ship])
    # manage pagination
    next_button = soup.find('button', attrs={'title': 'Next'})
    if not next_button.has_attr('disabled'):
        url = URL_PART1 + str(PAGE_NO) + URL_PART2
    else:
        url = ""
I want to get the link that the download link on the article page redirects to.
For example:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
In the above article page, there are the following download links:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=
If you open this link directly, it will not redirect to the real download link; you need to open it from the article page.
# coding=utf-8
import lxml
import re
import requests
import sys
from bs4 import BeautifulSoup
from urllib.request import urlopen


def urlopen(url):
    '''
    using requests to replace urllib.request.urlopen
    return an html
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text


def generate_pages(subTitle, fromPage, toPage):
    '''
    return page sites' url list
    '''
    pages = []
    if fromPage > 0 and fromPage < toPage:
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages


def get_book_sites_of_one_page(page):
    '''
    get book site's url in one page
    input: page site url
    output: book site urls list
    return book sites in one page
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    for link in linkList[::2]:
        if 'href' in link.attrs:
            # print(link)
            bookSites.append(link.attrs['href'])
    return bookSites


def get_book_urls(bookSite):
    '''
    input a book site
    find book downloading urls in this book site
    then
    return them as a list
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll("a", {"target": "_blank"})
    for link in linkList[::2]:
        # print(link)
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs


def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)
    for page in pages:
        bookSiteOfOnePage = get_book_sites_of_one_page(page)
        bookSites.extend(bookSiteOfOnePage)
    for bookSite in bookSites:
        book_urls = get_book_urls(bookSite)
        bookURLs += book_urls
    for bookURL in bookURLs:
        print(bookURL)
    # with open(filename, 'w') as f:
    #     f.write(bookURLs)


def main():
    if len(sys.argv) == 4:
        '''
        python getUrl.py 1 100 programming
        from page 1 to page 100 in subject programming
        '''
        subTitle = str(sys.argv[3])
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)
    elif len(sys.argv) == 3:
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        subTitle = ''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        # filename = subTitle + "-" + str(pageNum) + ".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif len(sys.argv) == 2:
        '''
        python getUrl.py 10
        from page 10 to page 10
        only download books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage + 1
        subTitle = ''
        # filename = "All-" + str(pageNum) + ".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif len(sys.argv) == 1:
        fromPage = 1
        # custom page range
        toPage = 2
        subTitle = ''
        # filename = "All-" + "1" + "-" + time.strftime('%Y-%m-%d', time.localtime()) + ".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    else:
        print("Error, too many arguments")


if __name__ == '__main__':
    # filename = ''
    main()
Thank you for your help!
This website checks if the referer is set while redirecting. You can just give the original url as referer in the header and easily bypass this. You can also see that the referer is used as a url parameter in the final download link.
import requests
from bs4 import BeautifulSoup
s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url
}
r2 = requests.get(download_redirect_link, headers=headers)
print(r2.url)
Output
https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
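If the goal is also to save whatever the final URL serves, here is a rough streaming-download sketch continuing from the snippet above; note it assumes the final URL returns the file itself, which may not hold for every file host:

# continue from r2 above; stream the response body to disk in chunks
with requests.get(r2.url, headers=headers, stream=True) as resp:
    resp.raise_for_status()
    with open('download.bin', 'wb') as fh:  # hypothetical output filename
        for chunk in resp.iter_content(chunk_size=8192):
            fh.write(chunk)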
I am trying to extract some information about MTG cards from a webpage with the following program, but I repeatedly retrieve information about the initial page given (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe there is a restriction on using urllib that slipped my attention. Here is the code that I have struggled with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup
InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4 # depth of pages to be retrieved
query = InitUrl.split("?")[1]
for i in range(0, NumOfPages):
    if i == 0:
        Url = InitUrl
    else:
        Url = URL_Next
    print(Url)

    UClient = uReq(Url)  # downloading the url
    page_html = UClient.read()
    UClient.close()

    page_soup = soup(page_html, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})

    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")

    try:
        URL_Next = InitUrl + "&page=" + str(i + 2)
        print("The next URL is: " + URL_Next + "\n")
    except IndexError:
        print("Crawling process completed! No more information to retrieve!")
    else:
        NumOfCrawledPages += 1
        Url = URL_Next
    finally:
        print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require them to allow paging.
A clean and simple way of extracting the data you're interested in would look like this:
import requests
from bs4 import BeautifulSoup
# the site actually uses this url under the hood for paging - check out Google Dev Tools
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"
return_list = []
# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
session.get("https://mtgsingles.gr/")
All pages have a next button except the last one, so we use this knowledge to loop until the next button goes away. When it does, meaning that the last page is reached, the button is replaced with an 'li' tag with the class 'next hidden', which only exists on the last page.
Now we're ready to start looping:
page = 1            # set count for start page
keep_paging = True  # use flag to end loop when last page is reached
while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')
    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        try:
            toughness = toughness_element.get_text().strip()
        except:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)
    if soup.select('li.next.hidden'):  # this element only exists if the last page is reached
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1

# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will keep paging until no more pages exist, no matter how many subpages the site has.
My point in the comment above was merely that if you encounter an exception in your code, your page count would never increase. That's probably not what you want, which is why I recommended that you learn a little more about the behaviour of the whole try-except-else-finally construct.
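For reference, here is a tiny, hypothetical illustration of how that control flow behaves (unrelated to the scraper itself):

# 'else' runs only when no exception was raised, 'finally' runs regardless,
# so a counter incremented in 'else' stalls whenever an exception occurs
for page in range(3):
    try:
        value = 10 // page          # raises ZeroDivisionError when page == 0
    except ZeroDivisionError:
        print("exception on page", page)
    else:
        print("ok on page", page, "value:", value)
    finally:
        print("finally always runs for page", page)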
I am also baffled by the request giving the same reply and ignoring the page parameter. As a dirty solution, I can offer you to first set the page size to a number high enough to get all the items that you want (this parameter works, for some reason...):
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup
InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2 # depth of pages to be retrieved
query = "dragon"
cardSet=set()
for i in range(1, NumOfPages):
    page_html = requests.get(InitUrl, params={"page": i, "q": query, "page-size": 999})
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")
    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)
    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " + str(len(cards)) + " (cards)\n")
I am trying to get movie reviews from the Fandango website. Even when I hit the URL for the second page (and onwards) of movie reviews for a particular movie, I keep getting the first page. Do I need to send cookies with the requests?
Below is my code snippet:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
baseUrl = 'https://www.fandango.com/movie-reviews'
req = Request(baseUrl, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')
# Getting all the movie links from the first page
movieLinks = soup.find_all("a", class_='dark')
# Get reviews for every movie
for i in range(2):  # len(movieLinks)
    try:
        movieName = movieLinks[i].text.replace(' Review', '')
        count = 1
        print('\n\n****** ' + movieName + ' ********\n\n')
        # Getting movie reviews from first 10
        for j in range(3):
            pageNum = j + 1
            movieReviewUrl = movieLinks[i]['href'] + '?pn=' + str(pageNum)
            print('Hitting URL: ' + movieReviewUrl)
            revReq = Request(movieReviewUrl, headers={'User-Agent': 'Mozilla/5.0'})
            revWebpage = urlopen(revReq).read()
            revSoup = BeautifulSoup(revWebpage, 'html.parser')
            revArr = revSoup.find_all("p", class_="fan-reviews__item-content")
            for k in range(len(revArr)):
                if len(revArr[k]) > 0:
                    print(str(count) + ' : ' + revArr[k].text)
                    count = count + 1
    except:
        print('Error for movie: ' + movieName)
I suggest using Requests; it makes handling such requests much easier.
from bs4 import BeautifulSoup
import requests
baseUrl = 'https://www.fandango.com/movie-reviews'
# req = Request(baseUrl, headers={'User-Agent': 'Mozilla/5.0'})
webpage = requests.get(baseUrl).text
soup = BeautifulSoup(webpage, 'html.parser')
# Getting all the movie links from the first page
movieLinks = soup.find_all("a", class_='dark')
# Get reviews for every movie
for i in range(2):  # len(movieLinks)
    try:
        movieName = movieLinks[i].text.replace(' Review', '')
        count = 1
        print('\n\n****** ' + movieName + ' ********\n\n')
        # Getting movie reviews from first 10
        for j in range(3):
            pageNum = j + 1
            movieReviewUrl = movieLinks[i]['href'] + '?pn=' + str(pageNum)
            print('Hitting URL: ' + movieReviewUrl)
            # revReq = Request(movieReviewUrl, headers={'User-Agent': 'Mozilla/5.0'})
            # revWebpage = urlopen(revReq).read()
            revWebpage = requests.get(movieReviewUrl).text
            revSoup = BeautifulSoup(revWebpage, 'html.parser')
            revArr = revSoup.find_all("p", class_="fan-reviews__item-content")
            print(len(revArr))
            for k in range(len(revArr)):
                if len(revArr[k]) > 0:
                    print(str(count) + ' : ' + revArr[k].text)
                    count = count + 1
    except:
        print('Error for movie: ' + movieName)
When you run it, you can see that revArr comes back with length 0, so please check the "fan-reviews__item-content" class.
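As a quick, hypothetical diagnostic, you can check whether that class name appears in the raw HTML at all; if it does not, the reviews are probably rendered client-side and requests alone will not see them:

import requests

# hypothetical check: fetch one review page and look for the review class name in the raw HTML
movieReviewUrl = 'https://www.fandango.com/movie-reviews'  # substitute a real review-page URL here
raw_html = requests.get(movieReviewUrl, headers={'User-Agent': 'Mozilla/5.0'}).text
print('fan-reviews__item-content' in raw_html)  # False suggests the reviews are loaded via JavaScript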