Python BeautifulSoup... checking if the keyword is in the soup

I am writing code that retrieves data from an e-journal site.
What I want to get are the titles, pages, authors, and abstracts of the articles.
I have succeeded in retrieving the data and am now building a list that combines them per article.
Some articles don't have authors or abstracts, so I used an if statement inside def article(): to handle those cases. But it doesn't work; it always shows the results from the else branch. Please help me...
(I'm not a native English speaker so I hope you understand what I want to say...)
import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}

URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")

T = []
for title in JAGS_soup.select("a > h2"):
    T.append(title.text)

P = []
for page in JAGS_soup.select(".page-range"):
    P.append(page.text)

A = []
for author in JAGS_soup.select(".comma__list"):
    A.append(author.text)

L = []
for link in JAGS_soup.find_all('a', {"title": "Abstract"}):
    L.append(link.get('href'))

Ab_Links = []
a = 0
for ab_link in L:
    full_link = "https://agsjournals.onlinelibrary.wiley.com" + L[a]
    Ab_Links.append(full_link)
    a = a + 1

b = 0
Ab = []
Ab_URL = Ab_Links[b]
for ab_url in Ab_Links:
    Ab_result = requests.get(Ab_Links[b], headers=h)
    Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
    abstract = Ab_soup.find(class_='article-section article-section__abstract').text
    Ab.append(abstract)
    b = b + 1

result = JAGS_soup.find_all("div", {"class": "issue-item"})

def article():
    x = 0
    results = []
    for y in list(range(len(T))):
        an_article = [T[x], P[x]]
        if "author" in result[x]:
            an_article.append(A[x])
        else:
            an_article.append(" ")
        if "Abstract" in result[x]:
            an_article.append(Ab[x])
        else:
            an_article.append("No Abstract available")
        results.append(an_article)
        x = x + 1
    return results

print(article())
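One reason the checks always fall through to the else branches: the in operator on a BeautifulSoup Tag only tests the tag's direct children, so "author" in result[x] is essentially never true. Searching for the relevant element inside each issue-item div behaves as intended; a minimal sketch, assuming the T, P, A, Ab, and result lists built above (note that A and Ab only hold entries for articles that actually have authors/abstracts, so the indices can still drift out of step; the answer below avoids that by building each article in a single pass):

def article():
    results = []
    for x in range(len(T)):
        an_article = [T[x], P[x]]
        # look for the author list / abstract link inside this issue-item div
        if result[x].find(class_="comma__list"):
            an_article.append(A[x])
        else:
            an_article.append(" ")
        if result[x].find("a", {"title": "Abstract"}):
            an_article.append(Ab[x])
        else:
            an_article.append("No Abstract available")
        results.append(an_article)
    return results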

I believe I have written some code that does what you hope to accomplish. I don't generally like making a list of all components and then trying to assemble each article afterwards. Instead, I got the topic, title, authors, pages, and abstract link of each article and just made the article object there.
Here is the code:
import bs4
import requests

print(bs4.__version__)
print(requests.__version__)

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
res = requests.get("https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7", headers=headers)
soup = bs4.BeautifulSoup(res.content, "html.parser")

# gets the info on the main issue
parent_volume_tag = soup.find('li', class_="grid-item cover-image")
volume_and_issue = parent_volume_tag.find('div', class_="cover-image__parent-item").text.replace('\n', '')
page_nums = parent_volume_tag.find('div', class_="cover-image__pageRange").text.replace('\n', '')
issue_date = parent_volume_tag.find('div', class_="cover-image__date").text.replace('\n', '')
volume_info = [volume_and_issue, page_nums, issue_date]  # creates a list with the relevant volume/issue data

def get_category(container):
    return container.find('h3').text

def get_issue_list(container):
    issues = []
    for issue in container.find_all('div', class_="issue-item"):
        if not issue.find('div', class_='issue-item'):
            issues.append(issue)
    return issues

def get_item_title(issue):
    return issue.find('a')

def get_item_authors(issue):
    author_list = issue.find('div', class_="comma__list")
    if author_list:
        authors = [i.text for i in author_list.find_all('span', class_="comma__item")]
        authors = [i.replace('\n ', '').replace(', ', '') for i in authors]
        return authors
    else:
        return None

def get_abstract_link(issue):
    abstract_tag = issue.find('a', title="Abstract")
    if abstract_tag:
        link = "https://agsjournals.onlinelibrary.wiley.com" + abstract_tag.get('href')
        return link
    else:
        return None

containers = soup.find_all('div', class_="issue-items-container bulkDownloadWrapper")

all = []
for container in containers:
    topic = get_category(container)
    issues = get_issue_list(container)
    category = []
    for issue in issues:
        title = issue.find('h2').text
        authors = get_item_authors(issue)
        page_range = issue.find('li', class_='page-range').text.replace('Pages: ', '')
        abstract_link = get_abstract_link(issue)
        article = [topic, title, authors, page_range, abstract_link]
        category.append(article)
    all.append(category)
Within the all list, you have lists of articles grouped by topic, such as "Editorials" or "Covid Related Content".
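For example, a minimal sketch of how that structure could be walked (assuming the all list built by the code above; each article is [topic, title, authors, page_range, abstract_link]):

# one inner list per topic, one 5-element list per article
for category_articles in all:
    for topic, title, authors, page_range, abstract_link in category_articles:
        print(topic, '|', title, '|', page_range)
        print('  authors:', authors)  # None when no authors were listed
        print('  abstract:', abstract_link or 'No abstract available')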

Related

Web-Scraping using BeautifulSoup (missing values when scraping)

I have been trying to webscrape a realtor website using BeautifulSoup and encountered 2 difficulties that I cannot seem to fix.
Difficulties:
When I run my code below, I am missing some date values. The dataframe should hold 68 rows of data scraped from the first page. The description and title scrapes return 68 rows, but the date scrape returns only 66. I don't get 'N/A' values returned when a date is missing, either. Does anyone have an idea why this is? When I inspected the website elements, they had the same tags, except that the listings are VIP or Special (promotion) apartments.
Secondly, I cannot seem to figure out how to scrape meta itemprop tags. I keep getting blank values when I use:
for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
    for tag2 in tag.findAll('div', attrs={'class': 'announcement-block__date'}):
Thank you in advance for any assistance you could provide.
Python Code:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd

def get_headers():
    # Headers
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'accept-language': 'en-US,en;q=0.9',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    return headers

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

count = 1  # for pagination

# Make list holders
title = []
description = []
date = []

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']

for x in urls:
    count = 1
    y = x
    while (count < 2):  # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers())  # req all headers
        htmlfile = urlopen(req)
        htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')

        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            for tag2 in tag.findAll('a', attrs={'class': 'announcement-block__title'}):
                text = tag2.get_text().strip()
                if len(text) > 0:
                    title.append(text)
                else:
                    title.append('N/A')

        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            for tag2 in tag.findAll('div', attrs={'class': 'announcement-block__description'}):
                text = tag2.get_text().strip()
                if len(text) > 0:
                    description.append(text)
                else:
                    description.append('N/A')

        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            for tag2 in tag.findAll('div', attrs={'class': 'announcement-block__date'}):
                text = tag2.get_text().strip()
                if len(text) > 0:
                    date.append(text)
                else:
                    date.append('N/A')

        # Go to next page
        count = count + 1
        page = '?page=' + str(count)
        x = y + page

data_frame = pd.DataFrame(list(zip(title, description, date)), columns=['Title', 'Description', 'Date'])
You get 66 items because your date[] list contains only 66 elements, so you need to check all three fields at once, in a single for loop. Your if/else checks do nothing, because there are no announcement-block__date divs with empty content on the page; listings without a date simply have no such div at all.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd

def get_headers():
    # Headers
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'accept-language': 'en-US,en;q=0.9',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    return headers

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

count = 1  # for pagination

# Make list holders
info = {
    'title': [],
    'description': [],
    'date': []
}

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']

for x in urls:
    count = 1
    y = x
    while (count < 2):  # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers())  # req all headers
        htmlfile = urlopen(req)
        htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')

        for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            title = tag.find('a', attrs={'class': 'announcement-block__title'})
            description = tag.find('div', attrs={'class': 'announcement-block__description'})
            date = tag.find('div', attrs={'class': 'announcement-block__date'})

            info['title'].append(title.get_text().strip() if title else 'N/A')
            info['description'].append(description.get_text().strip() if description else 'N/A')
            info['date'].append(date.get_text().strip() if date else 'N/A')

        # Go to next page
        count = count + 1
        page = '?page=' + str(count)
        x = y + page

data_frame = pd.DataFrame(list(zip(info['title'], info['description'], info['date'])), columns=['Title', 'Description', 'Date'])
print(len(info['title']), len(info['description']), len(info['date']))
print(data_frame)
About your second question, a similar question has already been answered here
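In short, meta tags keep their value in the content attribute rather than in the tag text, so read that attribute instead of calling get_text(); a small sketch, assuming the relevant meta tags inside each announcement block carry an itemprop attribute (adjust the lookup to the actual markup):

# <meta> tags have no visible text; their value is in the "content" attribute
for tag in soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
    meta = tag.find('meta', attrs={'itemprop': True})  # any <meta itemprop="..."> inside the block
    if meta:
        print(meta.get('itemprop'), '=', meta.get('content', 'N/A'))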

Can't print all the results all at once

I'm trying to create a script which will fetch the title and the description of products from this webpage. On its landing page there is a single product. However, if you look at the left-hand area, you will notice a tab titled 17 products. I'm trying to grab their titles and descriptions as well. The 17 products tab in reality does nothing, as the 17 products are already within the page source.
I can fetch all 18 products in the following manner, but I had to use print twice to print them all. If I append the results and print them all together, the script will look messier.
import requests
from bs4 import BeautifulSoup

link = 'https://www.3m.com/3M/en_US/company-us/all-3m-products/~/3M-Cubitron-II-Cut-Off-Wheel/?N=5002385+3290927385&preselect=8710644+3294059243&rt=rud'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    product_title = soup.select_one("h1[itemprop='name']").text
    specification = soup.select_one(".MMM--tabHeader:contains('Product Details') + .tabContentContainer").get_text(strip=True)[:30]  # truncated for brevity
    print(product_title, specification)
    for additional_link in list(set([item.get("href") for item in soup.select(".js-row-results .allModelItemDetails a.SNAPS--actLink")])):
        res = s.get(additional_link)
        sauce = BeautifulSoup(res.text, "lxml")
        product_title = sauce.select_one("h1[itemprop='name']").text
        specification = sauce.select_one(".MMM--tabHeader:contains('Product Details') + .tabContentContainer").get_text(strip=True)[:30]  # truncated for brevity
        print(product_title, specification)
How can I print all the title and description of products all together?
Not sure if I understand your question. You want to print all of the titles and descriptions together, but you don't want to append them to a list, because the script would be messy?
One option is to use a dict instead of a list. Define a dict at the top of your code after the imports, products = {}, and swap out your print statements with products[product_title] = specification.
Afterwards, you can use the pprint module, which is part of the Python standard library, to neatly print the dictionary object, like so:
import pprint

some_random_dict = {'a': 123, 'b': 456}
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(some_random_dict)
Replace some_random_dict with products
If you're concerned with neatness, I would also refactor this bit into a separate function:
r = s.get(link)
soup = BeautifulSoup(r.text,"lxml")
product_title = soup.select_one("h1[itemprop='name']").text
specification = soup.select_one(".MMM--tabHeader:contains('Product Details') + .tabContentContainer").get_text(strip=True)[:30] #truncated for brevity
Maybe something like this:
def get_product(sess, link):
    info = {}
    r = sess.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    product_title = soup.select_one("h1[itemprop='name']").text
    specification = soup.select_one(".MMM--tabHeader:contains('Product Details') + .tabContentContainer").get_text(strip=True)[:30]  # truncated for brevity
    info[product_title] = specification
    return soup, info
Your code would then look like this:
products = {}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    soup, product_info = get_product(s, link)
    products.update(product_info)
    for additional_link in list(set([item.get("href") for item in soup.select(".js-row-results .allModelItemDetails a.SNAPS--actLink")])):
        sauce, product_info = get_product(s, additional_link)
        products.update(product_info)
Having the same piece of code pasted around in multiple places is something that should always be avoided. Refactoring that bit into a separate function will help readability and maintainability in the long run.

Data overwrites when export to Excel

I'm scraping a website to gather the ten most recent articles based on some keywords. Once I get my data (keyword used, article name, the URL/hyperlink, and the publication date), I want to write it all to an xls file. So far it only writes the results for the last keyword as opposed to all four; it's just overwriting the same section of the spreadsheet. How can I display my entire list, not just the most recent section?
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")

keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')
for keyword in keywords:
    keyword.replace("+", " ")

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    mylist = []
    cols = "KeyWord", "Article", "URL", "Publication Date"
    mylist.append(cols)
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)
    book = xlwt.Workbook()
    sheet = book.add_sheet("Articles")
    for i, row in enumerate(mylist):
        for j, col in enumerate(row):
            sheet.write(i, j, col)
    book.save("C:\Python\American Banker\American Banker {}.xls".format(today))

for keyword in keywords:
    article_fetch(keyword)

print('Workbook Saved')
I would expect to see my entire list, which would have results for all four keywords. However I am only seeing the results for the last keyword.
I've moved Excel file generation to the end of the script:
import requests
from bs4 import BeautifulSoup
import datetime
import xlwt
from xlwt import Formula

today = datetime.date.today().strftime("%Y%m%d")

keywords = ('PNC', 'Huntington', 'KeyCorp', 'Fifth Third')
for keyword in keywords:
    keyword.replace("+", " ")

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

def article_fetch(keyword):
    url = 'https://www.americanbanker.com/search?query={}'.format(keyword)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    for articles in soup.find_all("div", "feed-item"):
        article = articles.find("h4").text.strip()
        timestamp = articles.find("span", "timestamp").text.strip()
        article_url = 'https://{}'.format(articles.find("a")["href"][2:])
        link = 'HYPERLINK("{}", "Link" )'.format(article_url)
        item = [keyword, article, Formula(link), timestamp]
        mylist.append(item)

mylist = []
cols = "KeyWord", "Article", "URL", "Publication Date"
mylist.append(cols)

for keyword in keywords:
    article_fetch(keyword)

book = xlwt.Workbook()
sheet = book.add_sheet('Articles')

for i, row in enumerate(mylist):
    for j, col in enumerate(row):
        sheet.write(i, j, col)

book.save("American Banker {}.xls".format(today))

print('Workbook Saved')
The data is no longer lost.

error in scraping inner tags of html element using python

I am working on an exercise in which I extracted a whole webpage's source data. I am particularly interested in the area tag, and specifically in its onclick attribute. How can I extract the onclick attribute from a particular element?
The extracted data looks like this:
<area class="borderimage" coords="21.32,14.4,933.96,180.56" href="javascript:void(0);" onclick="return show_pop('78545','51022929357','1')" onmouseover="borderit(this,'black','<b>इंदौर, गुरुवार, 10 मई , 2018 <b><br><bआप पढ़ रहे हैं देश का सबसे व...')" onmouseout="borderit(this,'white')" alt="<b>इंदौर, गुरुवार, 10 मई , 2018 <b><br><bआप पढ़ रहे हैं देश का सबसे व..." shape="rect">
Again, it's the onclick attribute I'm after. Here is the code I have written so far, but nothing has worked:
import requests
from lxml import html

paper_url = 'http://epaper.bhaskar.com/indore/129/10052018/mpcg/1/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

# Total number of pages available in this paper
page = requests.get(paper_url, headers=headers)
page_response = page.text
parser = html.fromstring(page_response)

XPATH_Total_Pages = '//div[contains(@class,"fs12 fta w100 co_wh pdt5")]//text()'
raw_total_pages = parser.xpath(XPATH_Total_Pages)
lastpage = raw_total_pages[-1]
print(int(lastpage))
finallastpage = int(lastpage)

reviews_list = []
XPATH_PRODUCT_NAME = '//map[contains(@name,"Mapl")]'
#XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'
#raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
#product_price = raw_product_price
raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
XPATH_REVIEW_SECTION_2 = '//area[@class="borderimage"]'
reviews = parser.xpath(XPATH_REVIEW_SECTION_2)
product_name = raw_product_name
#result = product_name.find(',')
#finalproductname = slice[0:product_name]
print(product_name)
print(reviews)

for review in reviews:
    #soup = BeautifulSoup(str(review), "html.parser")
    #parser2.feed(str(review))
    #allattr = [tag.attrs for tag in review.findAll('onclick')]
    #print(allattr)
    XPATH_RATING = './/area[@data-hook="onclick"]'
    raw_review_rating = review.xpath(XPATH_RATING)
    #cleaning data
    print(raw_review_rating)
If I got it right - you need to get all onclick attributes of <area> tags on a page.
Try something like this:
import requests
from bs4 import BeautifulSoup

TAG_NAME = 'area'
ATTR_NAME = 'onclick'

url = 'http://epaper.bhaskar.com/indore/129/10052018/mpcg/1/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')

# there are 3 <area> tags on the page; putting their onclick values into a list
area_onclick_attrs = [x[ATTR_NAME] for x in soup.findAll(TAG_NAME)]

print(area_onclick_attrs)
Output:
[
"return show_pophead('78545','51022929357','1')",
"return show_pop('78545','51022928950','4')",
"return show_pop('78545','51022929357','1')",
]
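If you also need the individual arguments passed to show_pop, a regular expression over those strings is enough; a small sketch based on the output above:

import re

onclick = "return show_pop('78545','51022929357','1')"
args = re.findall(r"'([^']*)'", onclick)  # everything between single quotes
print(args)                               # ['78545', '51022929357', '1']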

Check if a specific class and value exist in HTML using beautifulsoup Python

I'm writing a script to scrape the website "yelp.fr", but to scrape the number of stars I have to deal with a class that is generated automatically:
class="i-stars i-stars--regular-4 rating-large" ==> 4 stars
class="i-stars i-stars--regular-3-half rating-large" ==> 3.5 stars
My question: how can I do this, and how can I test whether the class exists on the HTML page?
from selenium import webdriver
from bs4 import BeautifulSoup

CITIES = "la rochelle(17000)"
places = "Bars"

driver = webdriver.Chrome()
driver.get("https://www.yelp.fr/search?find_desc="+places+"&find_loc="+CITIES+"")
page = driver.page_source
soup = BeautifulSoup(page, "lxml")

etoiles = soup.find_all("div", {"class": "biz-rating biz-rating-large clearfix"})
etoiles.get_attribute("title")
if etoiles:
    print "ok"
else:
    print "not "
Sometimes the class biz-rating biz-rating-large clearfix does not exist, as below.
The title of that DIV contains the number of stars/rating. You can get it like this:
ratings = soup.select(".i-stars")
for rating in ratings:
    print rating.attrs['title']
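If you then want the rating as a number, you can parse it out of that title text (or out of the class name itself); a small sketch, assuming the title looks like "4.0 star rating" / "3.5 star rating", which is the usual Yelp format:

import re

def stars(rating_tag):
    # first number in a title such as "3.5 star rating" (format assumed)
    match = re.search(r'\d+(?:[.,]\d+)?', rating_tag.attrs.get('title', ''))
    return float(match.group().replace(',', '.')) if match else None

for rating in soup.select(".i-stars"):
    print(stars(rating))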
I solved the problem using this:
yelp_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s&start=%s" % (place, city, str(id))
headers1 = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
response1 = requests.get(yelp_url).text
parser = html.fromstring(response1)
print "Parsing the page"
listing1 = parser.xpath("//li[@class='regular-search-result']")
for results in listing1:
    # (snippet from a larger script: raw_ratings, cleaned_ratings, raw_address,
    #  is_reservation_available and is_accept_pickup are built elsewhere)
    raw_review_count = results.xpath(".//span[contains(@class,'review-count')]//text()")
    raw_price_range = results.xpath(".//span[contains(@class,'price-range')]//text()")
    if raw_ratings:
        ratings = re.findall("\d+[.,]?\d+", cleaned_ratings)[0]
    else:
        ratings = 0
    price_range = len(''.join(raw_price_range)) if raw_price_range else 0
    address = ' '.join(' '.join(raw_address).split())
    address = unidecode(address)
    reservation_available = True if is_reservation_available else False
    accept_pickup = True if is_accept_pickup else False
