I am writing a parser and have run into a problem. I know there are many similar questions on the Internet, but none of them fit my case, so I am asking for help here. I have little experience, so this question may not be worded very well.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru/topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
           'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get.text())
    else:
        return 1

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="lent-brief")
    games = []
    for item in items:
        games.append({
            "title": item.find("div", class_="title lent-title").get_text(strip=True),
            "date": item.find("div", class_="game-date").get_text(strip=True),
            "ganre": item.find("div", class_="game-genre").get_text(strip=True),
        })
    print(games)
    print(len(games))
    return games

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages_count(html.text)
        print(pages_count)
    else:
        print('Error')

parse()
Error:
File "D:/Python/parser1.py", line 45, in parse
pages_count = get_pages_count(html.text)
NameError: name 'get_pages_count' is not defined
Your function is named get_pages, but you're calling get_pages_count:
def get_pages(html):
.. but when attempting to call it:
pages_count = get_pages_count(html.text)
.. the call should be:
pages_count = get_pages(html.text)
There is also a wrong method call in the function below: instead of pagination[1].get.text() it should be pagination[1].get_text() or pagination[1].text.
Code:
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].get_text())
    else:
        return 1
OR
def get_pages(html):
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find_all('a', class_='page')
    if pagination:
        return int(pagination[1].text)
    else:
        return 1
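Putting the two fixes together, the parse() function from your script only needs the corrected call; this is a sketch, the rest of the script stays exactly as you posted it:

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        pages_count = get_pages(html.text)  # corrected name: get_pages, not get_pages_count
        print(pages_count)
    else:
        print('Error')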
I am trying to multithread my scraper so it runs faster. Currently I have commented out the pagination so it finishes sooner for timing purposes, but it runs just as slowly as the simple scraper that does not use concurrent.futures.ThreadPoolExecutor. Also, when I try to stop the script with Ctrl+C, it seems to quit one process but then immediately continues scraping the same way, and I have to interrupt it again. So something changes, but neither the speed nor the data.
This is my scraper:
from bs4 import BeautifulSoup
import requests
import concurrent.futures

NUM_THREADS = 30
BASEURL = 'https://www.motomoto.lt'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}

page = requests.get(BASEURL, headers=HEADERS)
soup = BeautifulSoup(page.content, 'html.parser')
item_list = []

def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        executor.map(parse_category, soup)

def parse_category(soup):
    for a in soup.find_all('a', class_='subcategory-name', href=True):
        nexturl = BASEURL + a['href']
        parse_subcategory(nexturl)

def parse_subcategory(url):
    subcategoryPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(subcategoryPage.content, 'html.parser')
    for a in soup.find_all('a', class_='subcategory-image', href=True):
        nexturl = BASEURL + a['href']
        parse_products(nexturl)

def parse_products(url):
    productsPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(productsPage.content, 'html.parser')
    for a in soup.find_all('a', class_='thumbnail product-thumbnail', href=True):
        nexturl = a['href']
        parse_item(nexturl)
    # this = soup.find('a', attrs={'class':'next'}, href=True)
    # if this is not None:
    #     nextpage = BASEURL + this['href']
    #     print('-' * 70)
    #     parse_products(nextpage)

def parse_item(url):
    itemPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(itemPage.content, 'html.parser')
    title = get_title(soup)
    price = get_price(soup)
    category = get_category(soup)
    item = {
        'Title': title,
        'Price': price,
        'Category': category
    }
    item_list.append(item)
    print(item)

def get_title(soup):
    title = soup.find('h1', class_='h1')
    title_value = title.string
    title_string = title_value.strip()
    return title_string

def get_price(soup):
    price = soup.find('span', attrs={'itemprop':'price'}).string.strip()
    return price

def get_category(soup):
    category = soup.find_all("li", attrs={'itemprop':'itemListElement'})[1].find('span', attrs={'itemprop':'name'}).getText()
    return category

if __name__ == "__main__":
    main()
Currently I am multithreading the first function, that uses the BS4 soup to gather the category links. How may I fix it to make it faster, even though it's using multiple functions?
The signature of ThreadPoolExecutor.map is
map(func, *iterables, timeout=None, chunksize=1)
The executor processes iterables concurrently.
If you have supplied multiple soups like executor.map(parse_category, [soup1, soup2, ...]) they will be processed in parallel. But since you have supplied only one soup, you are "doing one thing concurrently", which means there is no concurrency.
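For illustration, here is a minimal self-contained sketch (with a hypothetical fetch function and URL list, not taken from the question) of how map only becomes concurrent when the iterable has several items:

import concurrent.futures
import requests

def fetch(url):
    # hypothetical worker: one HTTP request per task
    return url, requests.get(url).status_code

urls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # three items -> three tasks run in parallel;
    # a one-item iterable behaves like an ordinary function call
    for url, status in executor.map(fetch, urls):
        print(url, status)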
Since you are calling parse_category only once, wrapping that single call in the executor adds no concurrency. Instead, you can parallelize parse_subcategory and parse_products like this:
...
def main():
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
    parse_category(soup, executor)

def parse_category(soup, executor):
    executor.map(
        lambda url: parse_subcategory(url, executor),
        [BASEURL + a['href'] for a in soup.find_all('a', class_='subcategory-name', href=True)])

def parse_subcategory(url, executor):
    subcategoryPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(subcategoryPage.content, 'html.parser')
    executor.map(
        lambda url: parse_products(url, executor),
        [BASEURL + a['href'] for a in soup.find_all('a', class_='subcategory-image', href=True)])

def parse_products(url, executor):
    productsPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(productsPage.content, 'html.parser')
    executor.map(
        parse_item,
        # here you missed the `BASEURL`, I kept it as-is
        [a['href'] for a in soup.find_all('a', class_='thumbnail product-thumbnail', href=True)])
...
The remainder of the script is unchanged.
I didn't test it as the website seems inaccessible from my location. Reply if there's any bug.
I'm just learning Python and want to improve myself with examples. Sorry for my English, I'm in the process of learning a new language. :)
The program pulls data from an e-commerce site.
When I try to save the results to a CSV file, each new batch of data overwrites the previous data. I tried several examples but it didn't work.
Thanks for your help.
import requests
import gettext
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl as xls
import xlsxwriter

baseurl = "https://www.trendyol.com"
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41'
           }

for x in range(1, 62):
    r = requests.get(f'https://www.trendyol.com/cep-telefonu-x-c103498?pi={x}', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('div', class_='p-card-wrppr')

    for item in productlist:
        productname = item.find('span', class_='prdct-desc-cntnr-name').getText()
        productprice_old = item.find('div', class_='prc-box-sllng').getText()
        productprice_discount = item.find('div', class_='prc-box-dscntd')

        for productlink in item.find_all('a'):
            productlink = baseurl + productlink.get('href')

        if productprice_discount == None:
            productprice_discount = productprice_old
        else:
            productprice_discount = productprice_discount.getText()

        for merchant_name in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            merchant_name = soup.find('a', class_='merchant-text')
            if merchant_name == None:
                merchant_name = soup.find('a', class_='title')
            if merchant_name == None:
                merchant_name = soup.find('span', class_='product-description-market-place')
            if merchant_name == None:
                merchant_name = ('NULL')
            else:
                merchant_name = merchant_name.getText()
            break

        for product_image in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            product_image = soup.find_all('img', attrs={'class':'detail-section-img'})
            image_src = [x['src'] for x in product_image]
            image_src = [x for x in image_src if x.endswith('.jpg' or '.png')]
            break

        data = [[productname, productlink, productprice_old, productprice_discount, merchant_name, image_src]]
        df = pd.DataFrame(data, columns=["Product Name", "URL", "Price", "D-Price", "Store", "Image Url"])
        df.to_csv('trendyol3.csv')
You should add mode='a' (append), so that each write is appended to the file instead of rewriting it:
df.to_csv('trendyol3.csv', mode='a')
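Note that when appending inside a loop, to_csv will also write the header row and the index on every call. A common pattern (a sketch, not part of the original answer) is to skip the index and write the header only when the file does not exist yet:

import os

# hypothetical: df is the DataFrame built inside the loop above
df.to_csv('trendyol3.csv', mode='a', index=False,
          header=not os.path.exists('trendyol3.csv'))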
I need to get the text "Платонов А.П.". Here's my code so far.
import requests
from bs4 import BeautifulSoup
from pip._internal.network.utils import HEADERS

URL = "https://www.moscowbooks.ru/books/?sortby=name&sortdown=false"
HEADERS = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", "accept": "*/*"}
HOST = "https://www.moscowbooks.ru"

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", class_="catalog__item")
    books = []
    for item in items:
        author_check = item.find("a", class_="author-name")
        if author_check:
            author = author_check.get_text()
        else:
            author_check = "Автор не указан"
        books.append({
            "title": item.find("div", class_="book-preview__title").get_text(strip=True),
            "author": author_check,
            "link": HOST + item.find("a", class_="book-preview__title-link").get("href"),
            "cost": item.find("div", class_="book-preview__price").get_text(strip=True),
        })
    print(books)
    print(len(books))

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print("Error")

parse()
I have a problem with the author, because it comes out like this:
<a class="author-name" href="/catalog/author/21381/">Платонов А. П. </a>
I also need a little help with the price, because sometimes it comes out as '2\xa0274' instead of '2 274'.
The problem is that you put "author": author_check in your dictionary, while author_check = item.find("a", class_="author-name") is a tag and the extracted text is in author = author_check.get_text(). You can change your for loop into something like this:
for item in items:
    author_check = item.find("a", class_="author-name")
    if author_check:
        author = author_check.text
    else:
        author = "Автор не указан"
For your issue with the display of the prices, you can just replace the \xa0 (non-breaking space) with a comma or a space:
"cost": item.find("div", class_="book-preview__price").get_text(strip=True).replace(u"\xa0", ",")
I've had to deal with a similar problem. You can do the following:
author = author_check.get_text().split('>')[-2].split('<')[0]
You might have to substitute -2 with -1.
I'm trying to create a script using the requests module (without using a Session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a Session, I could fetch the content from that site flawlessly.
Here are the manual steps to reach the content:
Choose the first item from dropdown.
Get the links to the detail page.
Grab these two fields from detail page.
While creating the script using plain requests, I tried to make use of cookies but I ended up getting AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie, inner_link

def get_content(str_cookie, inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
What possible change should I bring about to make the script work?
There's a redirect that occurs in fetch_detail_page_link. Python Requests follows redirects by default, so when your script reads the cookies, it is only grabbing the cookies from the final request in the chain. You must access the history field of the response to see the redirects that were followed. Doing this with a Session object worked because it was preserving those cookies for you.
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However, if you insist on not using Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': "Scraping Your Vigentes 1.0",
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()  # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)  # merge in cookies from each redirect response
    cookies.update(res.cookies)  # merge in cookies from the final response
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link

def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
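For comparison, a Session keeps the cookies from every response in the redirect chain automatically, so the manual cookie jar above would not be needed. A minimal sketch of that idea (not tested against the site):

import requests

headers = {'User-Agent': 'Scraping Your Vigentes 1.0'}
link = 'https://compranet.hacienda.gob.mx/web/login.html'

with requests.Session() as s:
    s.headers.update(headers)
    # cookies from every response, including any redirects, accumulate in
    # s.cookies, so later requests made through the same Session send them
    # automatically without any manual RequestsCookieJar bookkeeping
    res = s.get(link)
    print(s.cookies.get_dict())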
I have two functions which crawl a webpage, look for a particular class, and find the href tags inside it.
url="https://www.poynter.org/ifcn-covid-19-misinformation/page/220/"
def url_parse(site):
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site,headers=hdr)
page = urlopen(req)
soup = BeautifulSoup(page)
return soup
def article_link(URL):
try:
soup=url_parse(URL)
for i in soup.find_all("a", class_="button entry-content__button entry-content__button--smaller"):
link=i['href']
except:
pass
return link
data['article_source']=""
for i, rows in data.iterrows():
rows['article_source']= article_link(rows['url'])
Issue
The functions url_parse and article_link work fine, but when I use article_link to update the cells of the data frame, it stops working after 1000 or 1500 URLs. I suspect the issue could be with my laptop's IP address, but I don't understand how to solve it because there is no error message.
Expectation
The function article_link should parse all the URLs in the data frame.
import requests
from bs4 import BeautifulSoup
from concurrent.futures.thread import ThreadPoolExecutor

url = "https://www.poynter.org/ifcn-covid-19-misinformation/page/{}/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url, num):
    with requests.Session() as req:
        print(f"Extracting Page# {num}")
        r = req.get(url.format(num), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = [item.get("href") for item in soup.findAll(
            "a", class_="button entry-content__button entry-content__button--smaller")]
        return links

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(main, url, num) for num in range(1, 238)]
    for future in futures:
        print(future.result())
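The per-page lists returned by the futures can also be flattened into a single list afterwards, for example to fill the data frame column from the question; a sketch that reuses the futures list from the loop above:

# collect every href from every page into one flat list
all_links = [href for future in futures for href in future.result()]
print(len(all_links))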