I am trying to multithread my scraper so it runs faster. For now I have commented out the pagination so it finishes sooner for timing, but it runs at the same speed as the simple scraper that does not use concurrent.futures.ThreadPoolExecutor. Also, when I try to stop the script with Ctrl+C it seems to quit one process, but the scraper immediately continues running and I have to interrupt it again. So something does change, but neither the speed nor the data.
This is my scraper:
from bs4 import BeautifulSoup
import requests
import concurrent.futures

NUM_THREADS = 30
BASEURL = 'https://www.motomoto.lt'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}

page = requests.get(BASEURL, headers=HEADERS)
soup = BeautifulSoup(page.content, 'html.parser')
item_list = []

def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        executor.map(parse_category, soup)

def parse_category(soup):
    for a in soup.find_all('a', class_='subcategory-name', href=True):
        nexturl = BASEURL + a['href']
        parse_subcategory(nexturl)

def parse_subcategory(url):
    subcategoryPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(subcategoryPage.content, 'html.parser')
    for a in soup.find_all('a', class_='subcategory-image', href=True):
        nexturl = BASEURL + a['href']
        parse_products(nexturl)

def parse_products(url):
    productsPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(productsPage.content, 'html.parser')
    for a in soup.find_all('a', class_='thumbnail product-thumbnail', href=True):
        nexturl = a['href']
        parse_item(nexturl)
    # this = soup.find('a', attrs={'class':'next'}, href=True)
    # if this is not None:
    #     nextpage = BASEURL + this['href']
    #     print('-' * 70)
    #     parse_products(nextpage)

def parse_item(url):
    itemPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(itemPage.content, 'html.parser')
    title = get_title(soup)
    price = get_price(soup)
    category = get_category(soup)
    item = {
        'Title': title,
        'Price': price,
        'Category': category
    }
    item_list.append(item)
    print(item)

def get_title(soup):
    title = soup.find('h1', class_='h1')
    title_value = title.string
    title_string = title_value.strip()
    return title_string

def get_price(soup):
    price = soup.find('span', attrs={'itemprop':'price'}).string.strip()
    return price

def get_category(soup):
    category = soup.find_all("li", attrs={'itemprop':'itemListElement'})[1].find('span', attrs={'itemprop':'name'}).getText()
    return category

if __name__ == "__main__":
    main()
Currently I am multithreading only the first function, which uses the BS4 soup to gather the category links. How can I fix it so it actually runs faster, even though the work is spread across multiple functions?
The signature of ThreadPoolExecutor.map is
map(func, *iterables, timeout=None, chunksize=1)
The executor processes the items of the iterables concurrently.
If you had supplied multiple soups, e.g. executor.map(parse_category, [soup1, soup2, ...]), they would be processed in parallel. But since you supplied only one soup, you are effectively "doing one thing concurrently", which means there is no real concurrency.
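To make that concrete, here is a minimal, self-contained sketch (the URLs are placeholders of my own) of map actually driving several downloads at once:

import concurrent.futures
import requests

URLS = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']

def fetch(url):
    # Each call runs in a worker thread; the network wait happens in parallel.
    return url, requests.get(url, timeout=10).status_code

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # Three items in the iterable -> up to three downloads in flight at the same time.
    for url, status in executor.map(fetch, URLS):
        print(url, status)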
Since parse_category is only called once, there is no point adding concurrency to it. Instead, you can parallelize parse_subcategory and parse_products like this:
...
def main():
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
    parse_category(soup, executor)

def parse_category(soup, executor):
    executor.map(
        lambda url: parse_subcategory(url, executor),
        [BASEURL + a['href'] for a in soup.find_all('a', class_='subcategory-name', href=True)])

def parse_subcategory(url, executor):
    subcategoryPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(subcategoryPage.content, 'html.parser')
    executor.map(
        lambda url: parse_products(url, executor),
        [BASEURL + a['href'] for a in soup.find_all('a', class_='subcategory-image', href=True)])

def parse_products(url, executor):
    productsPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(productsPage.content, 'html.parser')
    executor.map(
        parse_item,
        # here you missed the `BASEURL`, I kept it as-is
        [a['href'] for a in soup.find_all('a', class_='thumbnail product-thumbnail', href=True)])
...
The remainder of the script is unchanged.
I didn't test it as the website seems inaccessible from my location. Reply if there's any bug.
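One caveat about the sketch above, also untested: executor.map submits its tasks eagerly but returns a lazy iterator, so exceptions raised inside the workers are silently dropped unless the results are consumed, and main() never waits on the pool explicitly (the script only finishes because Python joins the worker threads at interpreter exit). If that becomes a problem, a possible restructuring of my own (reusing the question's BASEURL, HEADERS, NUM_THREADS and parse_item) is to have each stage return the URLs it finds and let main() fan out one level at a time inside a with block:

import concurrent.futures
import requests
from bs4 import BeautifulSoup

def get_links(url, css_class, prefix=BASEURL):
    # Fetch one page and return the URLs of all anchors carrying the given class.
    page = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'html.parser')
    return [prefix + a['href'] for a in soup.find_all('a', class_=css_class, href=True)]

def main():
    category_urls = get_links(BASEURL, 'subcategory-name')
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        # Flattening each map result waits for that level to finish and
        # re-raises any exception from a worker instead of hiding it.
        subcategory_urls = [u for urls in executor.map(
            lambda url: get_links(url, 'subcategory-image'), category_urls) for u in urls]
        product_urls = [u for urls in executor.map(
            # the product hrefs looked absolute already in the question, hence prefix=''
            lambda url: get_links(url, 'thumbnail product-thumbnail', prefix=''),
            subcategory_urls) for u in urls]
        list(executor.map(parse_item, product_urls))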
Related
Hello fellow coders :)
So as part of my research project I need to scrape data out of a website.
Obviously it detects bots, therefore I am trying to implement proxies on a loop I know works (getting the brands' URLs):
The working loop:
brands_links = []

for country_link in country_links:
    r = requests.get(url + country_link, headers=headers)
    soup_b = BeautifulSoup(r.text, "lxml")

    for link in soup_b.find_all("div", class_='designerlist cell small-6 large-4'):
        for link in link.find_all('a'):
            durl = link.get('href')
            brands_links.append(durl)
The loop using proxies:
brands_links = []
i = 0
while i in range(0, len(country_links)):
    print(i)
    try:
        proxy_index = random.randint(0, len(proxies) - 1)
        proxy = {"http": proxies[proxy_index], "https": proxies[proxy_index]}
        r = requests.get(url + country_links[i], headers=headers, proxies=proxy, timeout=10)
        soup_b = BeautifulSoup(r.text, "lxml")

        for link in soup_b.find_all("div", class_='designerlist cell small-6 large-4'):
            for link in link.find_all('a'):
                durl = link.get('href')
                brands_links.append(durl)

        if durl is not None:
            print("scraping happening")
            i += 1
        else:
            continue
    except:
        print("proxy not working")
        proxies.remove(proxies[proxy_index])

    if i == len(country_links):
        break
    else:
        continue
Unfortunately it does not scrape all the links.
With the working loop only using headers I get a list of length 3788. With this one I only get 2387.
By inspecting the data I can see it skips some country links hence the difference in length.
I am trying to force the loop to scrape all the links with the "if" statement but it does not seem to work.
Anyone knows what I am doing wrong or got an idea which would make it scrape everything?
Thanks in advance.
Thanks for sharing the link.
You said:
Obviously it detects bots therefore I am trying to implement
proxies...
What makes you think this? Here is some code I came up with, which seems to scrape all the divs, as far as I can tell:
def main():
    import requests
    from bs4 import BeautifulSoup

    countries = (
        ("United States", "United+States.html"),
        ("Canada", "Canada.html"),
        ("United Kingdom", "United+Kingdom.html")
    )

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    }

    for country, document in countries:
        url = f"https://www.fragrantica.com/country/{document}"
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        divs = soup.find_all("div", {"class": "designerlist"})
        print(f"Number of divs in {country}: {len(divs)}")

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
Number of divs in United States: 1016
Number of divs in Canada: 40
Number of divs in United Kingdom: 308
So I found a way to make the loop keep scraping until it actually scrapes the link.
Here's the updated code:
brands_links = []
i = 0
while i in range(0, len(country_links)):
    print(i)
    try:
        proxy_index = random.randint(0, len(proxies) - 1)
        proxy = {"http": proxies[proxy_index], "https": proxies[proxy_index]}
        r = requests.get(url + country_links[i], headers=headers, proxies=proxy, timeout=10)
        soup_b = BeautifulSoup(r.text, "lxml")

        for link in soup_b.find_all("div", class_='designerlist cell small-6 large-4'):
            for link in link.find_all('a'):
                durl = link.get('href')
                brands_links.append(durl)
    except:
        print("proxy not working")
        proxies.remove(proxies[proxy_index])
        continue

    try:
        durl
    except NameError:
        print("scraping not happening")
        continue
    else:
        print("scraping happening")
        del durl
        i += 1

    if i == len(country_links):
        break
    else:
        continue
So it is the trailing try/except NameError block that checks whether a link was actually scraped.
I am not really familiar with functions, so if anyone has a way to make this simpler or more efficient I would highly appreciate it. For now I will be using @Paul M's function to improve my loop or transform it into a function.
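If it helps, here is a rough, untested sketch of how that retry logic could be pulled into a function called once per country link. It assumes the same url, headers, proxies and country_links variables as the loops above, and unlike the loop it simply returns an empty list (instead of retrying) when a page parses fine but contains no designer divs:

import random
import requests
from bs4 import BeautifulSoup

def scrape_country(country_link, proxies, url, headers):
    # Keep trying random proxies until one of them returns a parseable page.
    while proxies:
        proxy_address = random.choice(proxies)
        proxy = {"http": proxy_address, "https": proxy_address}
        try:
            r = requests.get(url + country_link, headers=headers, proxies=proxy, timeout=10)
            soup = BeautifulSoup(r.text, "lxml")
        except requests.RequestException:
            print("proxy not working")
            proxies.remove(proxy_address)
            continue
        return [a.get('href')
                for div in soup.find_all("div", class_='designerlist cell small-6 large-4')
                for a in div.find_all('a')]
    return []  # every proxy failed

brands_links = []
for country_link in country_links:
    links = scrape_country(country_link, proxies, url, headers)
    print("scraping happening" if links else "scraping not happening")
    brands_links.extend(links)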
I am writing a parser but I have a problem. I understand that you can find many similar questions on the Internet, but they did not suit me. Therefore, I am asking you for help. I have little experience, so this question may not be phrased very well.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru/topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
           'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get.text())
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="lent-brief")
    games = []
    for item in items:
        games.append({
            "title": item.find("div", class_="title lent-title").get_text(strip=True),
            "date": item.find("div", class_="game-date").get_text(strip=True),
            "ganre": item.find("div", class_="game-genre").get_text(strip=True),
        })
    print(games)
    print(len(games))
    return games

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages_count(html.text)
        print(pages_count)
    else:
        print('Error')

parse()
Error:
File "D:/Python/parser1.py", line 45, in parse
pages_count = get_pages_count(html.text)
NameError: name 'get_pages_count' is not defined
Your function is named get_pages, but you're calling get_pages_count:
def get_pages(html):
.. but when attempting to call it:
pages_count = get_pages_count(html.text)
.. the call should be:
pages_count = get_pages(html.text)
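With that one change (everything else as in the question), parse() becomes:

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages(html.text)  # call matches the function's actual name
        print(pages_count)
    else:
        print('Error')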
In the function below, the method you have called is wrong.
Instead of pagination[1].get.text() it should be pagination[1].get_text() or
pagination[1].text
Code:
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get_text())
    else:
        return 1
OR
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].text)
    else:
        return 1
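Either version can be sanity-checked with the question's own helpers (get_html and URL), for example:

if __name__ == "__main__":
    html = get_html(URL)
    print(get_pages(html.text))  # prints the detected page count, or 1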
I have two functions which crawl the webpage, look for a particular class, and find an href tag inside it.
url="https://www.poynter.org/ifcn-covid-19-misinformation/page/220/"
def url_parse(site):
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site,headers=hdr)
page = urlopen(req)
soup = BeautifulSoup(page)
return soup
def article_link(URL):
try:
soup=url_parse(URL)
for i in soup.find_all("a", class_="button entry-content__button entry-content__button--smaller"):
link=i['href']
except:
pass
return link
data['article_source']=""
for i, rows in data.iterrows():
rows['article_source']= article_link(rows['url'])
Issue
The functions url_parse and article_link work fine, but when I use article_link to update the cells of a dataframe, it stops working after 1000 to 1500 URLs. I suspect my IP address may be getting blocked, but I don't understand how to solve it because there is no error message.
Expectation
The function article_link parses every URL inside the data frame.
import requests
from bs4 import BeautifulSoup
from concurrent.futures.thread import ThreadPoolExecutor

url = "https://www.poynter.org/ifcn-covid-19-misinformation/page/{}/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url, num):
    with requests.Session() as req:
        print(f"Extracting Page# {num}")
        r = req.get(url.format(num), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = [item.get("href") for item in soup.findAll(
            "a", class_="button entry-content__button entry-content__button--smaller")]
        return links

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(main, url, num) for num in range(1, 238)]
    for future in futures:
        print(future.result())
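If the goal is still to fill the article_source column afterwards, the per-page lists returned by the futures can be flattened into one list first, for example (a small follow-on sketch assuming the futures list from the code above):

# Collect every link from every scraped page into one flat list.
all_links = [link for future in futures for link in future.result()]
print(len(all_links))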
I want to scrape a few fields (title, URL, ASIN) from Amazon pages, but I ran into a problem: the script only parses 15 products while the page shows 50. I decided to print all the HTML to the console and saw that it ends after 15 products, without any errors from the script.
Here is the part of my script
keyword = "men jeans".replace(' ', '+')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
url = "https://www.amazon.com/s/field-keywords={}".format(keyword)
request = requests.session()
req = request.get(url, headers = headers)
sleep(3)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup)
That's because some of the items are generated dynamically. There might be a better solution than using selenium; however, as a workaround you can try the approach below instead.
from selenium import webdriver
from bs4 import BeautifulSoup

def fetch_item(driver, keyword):
    driver.get(url.format(keyword.replace(" ", "+")))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for items in soup.select("[id^='result_']"):
        try:
            name = items.select_one("h2").text
        except AttributeError:
            name = ""
        print(name)

if __name__ == '__main__':
    url = "https://www.amazon.com/s/field-keywords={}"
    driver = webdriver.Chrome()
    try:
        fetch_item(driver, "men jeans")
    finally:
        driver.quit()
Upon running the above script you should get around 56 names as a result.
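If the page ever loads its results too slowly for this to be reliable, an explicit wait could be added before parsing (my own addition, assuming the same url variable and BeautifulSoup import as the script above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def fetch_item(driver, keyword):
    driver.get(url.format(keyword.replace(" ", "+")))
    # Wait up to 10 seconds for at least one result container to be present
    # before handing the page source to BeautifulSoup.
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='result_']")))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for items in soup.select("[id^='result_']"):
        try:
            name = items.select_one("h2").text
        except AttributeError:
            name = ""
        print(name)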
import requests
from bs4 import BeautifulSoup

for page in range(1, 21):
    keyword = "red car".replace(' ', '+')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
    url = "https://www.amazon.com/s/field-keywords=" + keyword + "?page=" + str(page)
    request = requests.session()
    req = request.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    results = soup.findAll("li", {"class": "s-result-item"})
    for i in results:
        try:
            print(i.find("h2", {"class": "s-access-title"}).text.replace('[SPONSORED]', ''))
            print(i.find("span", {"class": "sx-price-large"}).text.replace("\n", ' '))
            print('*' * 20)
        except:
            pass
Amazon's pagination maxes out at about 20 pages here, and the loop above crawls each of those pages.
So I have this code that will give me the urls I need in a list format
import requests
from bs4 import BeautifulSoup

offset = 0
links = []

with requests.Session() as session:
    while True:
        r = session.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit?offset=%d" % offset)
        soup = BeautifulSoup(r.content, "html.parser")
        new_links = soup.find_all("a", {'class': "thumb"})

        # no more links - break the loop
        if not new_links:
            break

        # denotes the number of gallery pages gone through at one time (# of pages times 24 equals the number below)
        links.extend(new_links)
        print(len(links))
        offset += 24

        # denotes the number of gallery pages (# of pages times 24 equals the number below)
        if offset == 48:
            break

for link in links:
    print(link.get("href"))
After that, I try to get certain text from each of the URLs; that text sits in roughly the same place on every page. But whenever I run the second half, shown below, I keep getting a chunk of HTML text and some errors, and I'm not sure how to fix it, or whether there is another, preferably simpler, way to get the text from each URL.
import urllib.request
import re

for link in links:
    url = print("%s" % link)

    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'

    req = urllib.request.Request(url, headers = headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()

    paragraphs = re.findall(r'</a><br /><br />(.*?)</div>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span></div>', str(respData))
    for eachP in paragraphs:
        print(eachP)
    title = re.findall(r'<title>(.*?)</title>', str(respData))
    for eachT in title:
        print(eachT)
Your code:
for link in links:
    url = print("%s" % link)
assigns None to url. Perhaps you mean:
for link in links:
url = "%s" % link.get("href")
There's also no reason to use urllib to get the site's content; you can use requests, as you did before, by changing:
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
to
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
Now you can get the title and paragraph with just:
title = soup.find('div', {'class': 'dev-title-container'}).h1.text
paragraph = soup.find('div', {'class': 'text block'}).text
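Putting those pieces together, the second half of the script could look roughly like this (untested; it reuses the links list from the first half and assumes every page has both divs):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for link in links:
    url = "%s" % link.get("href")
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "html.parser")
    # Same selectors as above for the deviation title and description.
    title = soup.find('div', {'class': 'dev-title-container'}).h1.text
    paragraph = soup.find('div', {'class': 'text block'}).text
    print(title)
    print(paragraph)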