I'm looking for a way to optimize my Python code as a multithreaded application.
My code works in the following way: it downloads the sitemap and gathers all links from it into map_links. After that, the parser function visits every link found and extracts data from the tag I need.
import threading
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
base_url = 'https://example.com/page.php'
sitemap_url = 'https://example.com/sitemap.xml' #https://exgfmeetworld.xyz/sitemap.xml

# sitemap parsing function
def sitemap(sitemap_url, headers):
    map_links = []
    session = requests.Session()
    request = session.get(sitemap_url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'xml')
        for links in soup.find_all('loc'):
            map_links.append(links.text)
    return map_links

# page parsing function
def parser(base_url, headers):
    session = requests.Session()
    request = session.get(base_url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'html.parser')
        #keyword = soup.find_all('h1', attrs={'class':'vedaky'})
        keyword = soup.select('h1')[0].get_text()
    else:
        print('error')
        keyword = None
    return keyword

# main function
def main():
    all_links = sitemap(sitemap_url, headers)
    for i in all_links:
        keyword_pars = parser(i, headers)
        print(keyword_pars)

if __name__ == '__main__':
    main()
I have tried "multiprocessing import Pool" but it doesn't work for my purpose. I need non-pool decisions because I need much higher performance from the script. I'm planning to use it in more than 20 threads.
I can't test it without the proper links, but I think this will do what you want. It works by mapping the parser function over the list of links with a thread pool; pool.map collects the return values into a results list.
Note that I haven't added any of the much-needed error handling.
import threading
import requests
from bs4 import BeautifulSoup as bs
from multiprocessing.dummy import Pool as ThreadPool

SITE_MAP_URL = 'https://exgfmeetworld.xyz/sitemap.xml'
BASE_URL = 'https://example.com/page.php'
HEADERS = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

def get_site_map() -> list:
    request = requests.get(SITE_MAP_URL, headers=HEADERS, timeout=5)
    links = []
    if request.status_code == 200:
        soup = bs(request.content, "html.parser")
        links = [l.text for l in soup.find_all("loc")]
    return links

def parser(link: str):
    request = requests.get(link, headers=HEADERS, timeout=5)
    if request.status_code == 200:
        soup = bs(request.content, "html.parser")
        return soup.find("h1").text
    return None

# - MAIN
links = get_site_map()
parser_output = []
pool = ThreadPool(20)
results = pool.map(parser, links)
pool.close()
pool.join()
print(results)
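If a pool really is off the table, a minimal sketch using plain threading.Thread and a queue could look roughly like this. It is untested and assumes the sitemap() and parser() functions plus the headers and sitemap_url globals from the question's script, and it adds the basic error handling mentioned above.

import threading
import queue

NUM_WORKERS = 20

def worker(links_queue, results):
    while True:
        try:
            link = links_queue.get_nowait()
        except queue.Empty:
            return                              # no more work for this thread
        try:
            results.append(parser(link, headers))
        except Exception as exc:                # minimal error handling
            print(f'{link} failed: {exc}')
        finally:
            links_queue.task_done()

def threaded_main():
    links_queue = queue.Queue()
    for link in sitemap(sitemap_url, headers):
        links_queue.put(link)
    results = []                                # list.append is thread-safe in CPython
    threads = [threading.Thread(target=worker, args=(links_queue, results))
               for _ in range(NUM_WORKERS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(results)

Whether this is actually faster than the pool version depends on the site, since both approaches run 20 concurrent downloads.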
Hi, I am trying to get the src attribute from an image on the website. I locate the image by its class, since it is unique. The code below is able to locate the image, but it cannot save the image to MongoDB (it shows up as null), so I want to find the src and save the link instead.
P.S. The code works for other classes, but I am not sure how to locate the src and save it into "findImage".
https://myaeon2go.com/products/category/6236298/vegetable
The postal code is 56000.
cate_list = [
    "https://myaeon2go.com/products/category/1208101/fresh-foods",
    "https://myaeon2go.com/products/category/8630656/ready-to-eat",
    "https://myaeon2go.com/products/category/6528959/grocery",
    "https://myaeon2go.com/products/category/6758871/snacks",
    "https://myaeon2go.com/products/category/8124135/chill-&-frozen",
    "https://myaeon2go.com/products/category/4995043/beverage",
    "https://myaeon2go.com/products/category/3405538/household",
    "https://myaeon2go.com/products/category/493239/baby-&-kids",
]
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}
for x in range(len(cate_list)):
    url = cate_list[x]
    # fetch the page and parse it into soup
    result = requests.get(url, cookies=cookies)
    doc = BeautifulSoup(result.text, "html.parser")
    # a for loop located here to loop through all the products
    # <span class="n_MyDQk4X3P0XRRoTnOe a8H5VCTgYjZnRCen1YkC">myAEON2go Signature Taman Maluri</span>
    findImage = j.find("img", {"class": "pgJEkulRiYnxQNzO8njV shown"})
To extract the value of the src attribute, simply call .get('src') on your element.
Try to change your strategy for selecting elements and avoid classes that are often generated dynamically. I recommend relying on more static identifiers as well as the HTML structure.
for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        print(e.img.get('src'))
Note: Iterating over your list does not require the range(len()) construct.
Example
import requests
from bs4 import BeautifulSoup

cate_list = [
    "https://myaeon2go.com/products/category/1208101/fresh-foods",
    "https://myaeon2go.com/products/category/8630656/ready-to-eat",
    "https://myaeon2go.com/products/category/6528959/grocery",
    "https://myaeon2go.com/products/category/6758871/snacks",
    "https://myaeon2go.com/products/category/8124135/chill-&-frozen",
    "https://myaeon2go.com/products/category/4995043/beverage",
    "https://myaeon2go.com/products/category/3405538/household",
    "https://myaeon2go.com/products/category/493239/baby-&-kids",
]
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}

for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        print(e.img.get('src').split(')/')[-1])
Output
https://assets.myboxed.com.my/1659400060229.jpg
https://assets.myboxed.com.my/1662502067580.jpg
https://assets.myboxed.com.my/1658448744726.jpg
https://assets.myboxed.com.my/1627880003755.jpg
https://assets.myboxed.com.my/1662507451284.jpg
https://assets.myboxed.com.my/1662501936757.jpg
https://assets.myboxed.com.my/1659400602324.jpg
https://assets.myboxed.com.my/1627880346297.jpg
https://assets.myboxed.com.my/1662501743853.jpg
...
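Since the original goal was to store the link in MongoDB under "findImage", here is a hedged sketch of how the loop above could write each src into a document with pymongo. The connection string and the database/collection names (aeon / products) are hypothetical placeholders, not taken from the question.

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')   # hypothetical connection string
collection = client['aeon']['products']             # hypothetical db/collection names

for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        img = e.img
        collection.insert_one({
            'findImage': img.get('src') if img else None,  # save the link, not the image
            'name': e.get_text(strip=True),
        })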
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}
links = [
    "8630656/ready-to-eat",
    "1208101/fresh-foods",
    "6528959/grocery",
    "6758871/snacks",
    "8124135/chill-&-frozen",
    "4995043/beverage",
    "3405538/household",
    "493239/baby-&-kids",
]
allin = []

def get_soup(content):
    return BeautifulSoup(content, 'lxml', parse_only=SoupStrainer('img', class_="pgJEkulRiYnxQNzO8njV"))

def worker(req, url, link):
    r = req.get(url + link)
    soup = get_soup(r.content)
    return [urljoin(url, x['src']) for x in soup.select('img')]

def main(url):
    with requests.Session() as req, ThreadPoolExecutor(max_workers=10) as executor:
        req.cookies.update(cookies)
        fs = (executor.submit(worker, req, url, link) for link in links)
        for f in as_completed(fs):
            allin.extend(f.result())
    print(allin)

if __name__ == "__main__":
    main('https://myaeon2go.com/products/category/')
I am trying to multithread my scraper so it runs faster. Currently I have commented out the pagination so it finishes faster for time measurement, but it runs the same as the simple scraper where I did not use concurrent.futures.ThreadPoolExecutor. Also, when I try to quit the script with Ctrl+C, it seems to quit one process, but immediately afterwards it continues the same scraping and I have to stop it again. So something does change, but neither the speed nor the data.
This is my scraper:
from bs4 import BeautifulSoup
import requests
import concurrent.futures

NUM_THREADS = 30
BASEURL = 'https://www.motomoto.lt'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}

page = requests.get(BASEURL, headers=HEADERS)
soup = BeautifulSoup(page.content, 'html.parser')
item_list = []

def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        executor.map(parse_category, soup)

def parse_category(soup):
    for a in soup.find_all('a', class_='subcategory-name', href=True):
        nexturl = BASEURL + a['href']
        parse_subcategory(nexturl)

def parse_subcategory(url):
    subcategoryPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(subcategoryPage.content, 'html.parser')
    for a in soup.find_all('a', class_='subcategory-image', href=True):
        nexturl = BASEURL + a['href']
        parse_products(nexturl)

def parse_products(url):
    productsPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(productsPage.content, 'html.parser')
    for a in soup.find_all('a', class_='thumbnail product-thumbnail', href=True):
        nexturl = a['href']
        parse_item(nexturl)
    # this = soup.find('a', attrs={'class':'next'}, href=True)
    # if this is not None:
    #     nextpage = BASEURL + this['href']
    #     print('-' * 70)
    #     parse_products(nextpage)

def parse_item(url):
    itemPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(itemPage.content, 'html.parser')
    title = get_title(soup)
    price = get_price(soup)
    category = get_category(soup)
    item = {
        'Title': title,
        'Price': price,
        'Category': category
    }
    item_list.append(item)
    print(item)

def get_title(soup):
    title = soup.find('h1', class_='h1')
    title_value = title.string
    title_string = title_value.strip()
    return title_string

def get_price(soup):
    price = soup.find('span', attrs={'itemprop': 'price'}).string.strip()
    return price

def get_category(soup):
    category = soup.find_all("li", attrs={'itemprop': 'itemListElement'})[1].find('span', attrs={'itemprop': 'name'}).getText()
    return category

if __name__ == "__main__":
    main()
Currently I am multithreading only the first function, which uses the BS4 soup to gather the category links. How can I fix it to make it faster, even though it uses multiple functions?
The signature of ThreadPoolExecutor.map is
map(func, *iterables, timeout=None, chunksize=1)
The executor processes iterables concurrently.
If you have supplied multiple soups like executor.map(parse_category, [soup1, soup2, ...]) they will be processed in parallel. But since you have supplied only one soup, you are "doing one thing concurrently", which means there is no concurrency.
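To make that concrete, here is a tiny self-contained demonstration (the fetch function is a hypothetical stand-in that just sleeps instead of making a request):

import time
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    time.sleep(1)          # stand-in for a real HTTP request
    return url

with ThreadPoolExecutor(max_workers=30) as executor:
    # One item in the iterable -> one task -> no speed-up.
    list(executor.map(fetch, ['https://example.com']))
    # Thirty items -> thirty tasks run in parallel -> roughly 1 second, not 30.
    list(executor.map(fetch, [f'https://example.com/{i}' for i in range(30)]))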
As you are calling parse_category only once, there is no point in adding concurrency to it. Instead, you can parallelize parse_subcategory and parse_products like this:
...
def main():
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
    parse_category(soup, executor)

def parse_category(soup, executor):
    executor.map(
        lambda url: parse_subcategory(url, executor),
        [BASEURL + a['href'] for a in soup.find_all('a', class_='subcategory-name', href=True)])

def parse_subcategory(url, executor):
    subcategoryPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(subcategoryPage.content, 'html.parser')
    executor.map(
        lambda url: parse_products(url, executor),
        [BASEURL + a['href'] for a in soup.find_all('a', class_='subcategory-image', href=True)])

def parse_products(url, executor):
    productsPage = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(productsPage.content, 'html.parser')
    executor.map(
        parse_item,
        # here you missed the `BASEURL`, I kept it as-is
        [a['href'] for a in soup.find_all('a', class_='thumbnail product-thumbnail', href=True)])
...
The remainder of the script is unchanged.
I didn't test it as the website seems inaccessible from my location. Reply if there's any bug.
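An alternative, also untested, is to fan out one level at a time with a single executor so the script waits for all the work before exiting and any exceptions surface instead of being dropped. This sketch assumes the HEADERS, BASEURL, NUM_THREADS, soup, parse_item and item_list definitions from the question's script.

from bs4 import BeautifulSoup
import requests
import concurrent.futures

def collect_links(url, css_class, prefix=''):
    # Fetch one page and return the hrefs of the matching anchors, optionally prefixed.
    page = requests.get(url, headers=HEADERS)
    page_soup = BeautifulSoup(page.content, 'html.parser')
    return [prefix + a['href'] for a in page_soup.find_all('a', class_=css_class, href=True)]

def main():
    category_urls = [BASEURL + a['href']
                     for a in soup.find_all('a', class_='subcategory-name', href=True)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        # Level 1: fetch every category page concurrently, collect subcategory links.
        subcategory_urls = [u for urls in executor.map(
            lambda u: collect_links(u, 'subcategory-image', BASEURL), category_urls) for u in urls]
        # Level 2: fetch every subcategory page concurrently, collect product links
        # (the product hrefs appear to be absolute, as in the question's code).
        product_urls = [u for urls in executor.map(
            lambda u: collect_links(u, 'thumbnail product-thumbnail'), subcategory_urls) for u in urls]
        # Level 3: scrape every product page; list() drains the iterator so exceptions surface.
        list(executor.map(parse_item, product_urls))
    # The with-block waits for all submitted work, so item_list is complete here.
    print(len(item_list), 'items scraped')

if __name__ == '__main__':
    main()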
I'm trying to create a script using the requests module (without using a Session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a Session, I could fetch the content from that site flawlessly.
Here are the manual steps to reach the content:
Choose the first item from the dropdown.
Get the links to the detail page.
Grab these two fields from the detail page.
While creating the script using plain requests, I tried to make use of cookies, but I ended up getting an AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie, inner_link

def get_content(str_cookie, inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
What possible change should I bring about to make the script work?
There's a redirect that occurs on fetch_detail_page_link. Python Requests follows redirects by default. When your script obtains the cookies, it is only grabbing the cookies for the final request in the chain. You must access the history field of the response to see the redirects that were followed. Doing this with a Session object worked because it was preserving those cookies for you.
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However if you insist on not using Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': "Scraping Your Vigentes 1.0",
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()    # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)    # merge in cookies from each redirect response
    cookies.update(res.cookies)      # merge in cookies from the final response
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link

def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
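A quick way to see what is happening is to print the redirect chain and where each cookie was set. This is only a debugging sketch, reusing the cat_link and headers defined above:

import requests

res = requests.get(cat_link, headers=headers)
for hop in res.history:
    print(hop.status_code, hop.url, dict(hop.cookies))   # cookies set on each intermediate redirect
print(res.status_code, res.url, dict(res.cookies))       # cookies set on the final response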
I've written a script in Python to scrape the result populated when the two input boxes, zipcode and distance, are filled in with 66109 and 10000. When I enter the inputs manually, the site does display results, but when I try the same using the script I get nothing. The script throws no error either. What might be the issue here?
Website link: https://www.sart.org/clinic-pages/find-a-clinic/
I've tried with:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    session = requests.Session()
    response = session.post(link, data=payload, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

if __name__ == '__main__':
    get_clinics(url)
I'm only after this line, "Within 10000 miles of 66109 there are 383 clinics.", which is generated when the search is made.
I changed the URL and the request method to GET, and it worked for me:
def get_clinics(link):
    session = requests.Session()
    response = session.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

url = 'https://www.sart.org/clinic-pages/find-a-clinic?zip=66109&strdistance=10000&SelectedState=Select+State+or+Region'
get_clinics(url)
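The same GET request can also be expressed with a params dict instead of hand-building the query string, which keeps the URL readable. A small sketch using the same keys as the payload above:

import requests
from bs4 import BeautifulSoup

params = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region',
}

def get_clinics(link):
    # requests encodes the params dict into the query string for us
    response = requests.get(link, params=params, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'lxml')
    print(soup.select_one('.clinics__search-meta').get_text(strip=True))

get_clinics('https://www.sart.org/clinic-pages/find-a-clinic')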
Including cookies is one of the main concerns here. If you do it the right way, you can get a valid response following the approach you started with. Here is the working code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    with requests.Session() as s:
        res = s.get(link)
        req = s.post(link, data=payload, cookies=res.cookies.get_dict())
        soup = BeautifulSoup(req.text, "lxml")
        item = soup.select_one(".clinics__search-meta").get_text(strip=True)
        print(item)

if __name__ == '__main__':
    get_clinics(url)
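Because both requests go through the same Session, the cookies from the first GET are already stored in the session's cookie jar, so passing them explicitly is optional. A minimal equivalent that relies on the session itself (reusing the url and payload defined above):

import requests
from bs4 import BeautifulSoup

def get_clinics(link):
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0'
        s.get(link)                          # the first visit stores the cookies on the session
        req = s.post(link, data=payload)     # the session sends those cookies back automatically
        soup = BeautifulSoup(req.text, "lxml")
        print(soup.select_one(".clinics__search-meta").get_text(strip=True))

get_clinics(url)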
I am trying to scrape some emails from mdpi.com that are available only to logged-in users, but it fails when I try to do so: the page I get back is the same as what I see when logged out (screenshots omitted).
The code itself:
import requests
from bs4 import BeautifulSoup
import traceback

login_data = {'form[email]': 'xxxxxxx#gmail.com', 'form[password]': 'xxxxxxxxx', 'remember': 1}

base_url = 'http://www.mdpi.com'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}

session = requests.Session()
session.headers = headers

# log in
s = session.post('https://susy.mdpi.com/user/login', data=login_data)
print(s.text)
print(session.cookies)

def make_soup(url):
    try:
        r = session.get(url)
        soup = BeautifulSoup(r.content, 'lxml')
        return soup
    except:
        traceback.print_exc()
        return None

example_link = 'http://www.mdpi.com/search?journal=medsci&year_from=1996&year_to=2017&page_count=200&sort=relevance&view=default'

def article_finder(soup):
    one_page_articles_divs = soup.find_all('div', class_='article-content')
    for article_div in one_page_articles_divs:
        a_link = article_div.find('a', class_='title-link')
        link = base_url + a_link.get('href')
        print(link)
        article_soup = make_soup(link)
        grab_author_info(article_soup)

def grab_author_info(article_soup):
    # title of the article
    article_title = article_soup.find('h1', class_="title").text
    print(article_title)

    # affiliation
    affiliations_div = article_soup.find('div', class_='art-affiliations')
    affiliation_dict = {}
    aff_indexes = affiliations_div.find_all('div', class_='affiliation-item')
    aff_values = affiliations_div.find_all('div', class_='affiliation-name')
    for i, index in enumerate(aff_indexes):  # 0, 1
        affiliation_dict[int(index.text)] = aff_values[i].text

    # authors names
    authors_div = article_soup.find('div', class_='art-authors')
    authors_spans = authors_div.find_all('span', class_='inlineblock')
    for span in authors_spans:
        name_and_email = span.find_all('a')  # name and email
        name = name_and_email[0].text
        # email
        email = name_and_email[1].get('href')[7:]
        # affiliation_index
        affiliation_index = span.find('sup').text
        indexes = set()
        if len(affiliation_index) > 2:
            for i in affiliation_index.strip():
                try:
                    ind = int(i)
                    indexes.add(ind)
                except ValueError:
                    pass
        print(name)
        for index in indexes:
            print('affiliation =>', affiliation_dict[index])
        print('email: {}'.format(email))

if __name__ == '__main__':
    article_finder(make_soup(example_link))
What should I do in order to get what I want?
Ah, that is easy: you haven't managed to log in correctly. If you look at the response from your initial call, you will see that you are returned the login page HTML instead of the profile page. The reason for this is that you are not submitting the hidden token on the form.
The solution: request the login page, and then use either lxml or BeautifulSoup to parse the hidden input 'form[_token]'. Get that value and add it to your login_data payload.
Then submit your login request and you'll be in.
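Putting that into code, a rough sketch of the login step could look like this. It follows the description above; the 'form[_token]' input name comes from this answer and the credentials are redacted as in the question, so both may need adjusting to the actual form markup.

import requests
from bs4 import BeautifulSoup

LOGIN_URL = 'https://susy.mdpi.com/user/login'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}
login_data = {'form[email]': 'xxxxxxx#gmail.com', 'form[password]': 'xxxxxxxxx', 'remember': 1}

session = requests.Session()
session.headers = headers

# 1. Fetch the login page and pull out the hidden CSRF token.
login_page = session.get(LOGIN_URL)
soup = BeautifulSoup(login_page.text, 'lxml')
token_input = soup.find('input', {'name': 'form[_token]'})  # hidden input on the login form
login_data['form[_token]'] = token_input['value']

# 2. Submit the login form with the token included; the session keeps the resulting cookies.
response = session.post(LOGIN_URL, data=login_data)
print(response.status_code)

After this, the rest of the question's script (make_soup, article_finder, grab_author_info) can keep using the same session object.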