Python Web Scraping - Is it not possible to scrape this site? - python

I want to scrape the following website: https://www.globenewswire.com/NewsRoom
My goal is to store the press releases and articles in a database that I utilize later on. I've done this with other news sites too and deleted the code on here for easier readability (100% no influence on the code given to you). My problem is that I can't figure out how to exactly scrape headlines, links and other data since the html-code is structured with unusual attributes.
The following code is how I approached it. Maybe someone can spot the mistakes I made in the scraping logic. I would greatly appreciate any help.
import requests
import sqlite3
import Keywords
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from datetime import datetime
from datetime import timedelta
# ----- Initializing Database & Notification Service -----
# Open (or create) the SQLite file that stores every scraped article.
connect = sqlite3.connect('StoredArticles.db')
cursor = connect.cursor()
print("Connection created.")
# IF NOT EXISTS keeps the setup idempotent across restarts without a bare
# `except: pass`, which would also hide real failures such as a locked or
# read-only database file.
cursor.execute('''CREATE TABLE IF NOT EXISTS articlestable (article_time TEXT, article_title TEXT,
                  article_keyword TEXT, article_link TEXT, article_description TEXT,
                  article_entry_time DATETIME)''')
# The unique index lets later `INSERT OR IGNORE` statements skip links that
# are already stored.
cursor.execute('''CREATE UNIQUE INDEX IF NOT EXISTS index_article_link ON articlestable(article_link)''')
print("Table ready.")
class Scrapers:
    """Scrape GlobeNewswire news-room listing pages into ``articlestable``.

    The class is defined once, outside the polling loop, instead of being
    re-created on every pass.
    """

    def __init__(self):
        # ----- Initialize Keywords -----
        self.article_keyword = None
        self.article_title = None
        self.article_link = None
        self.article_time = None
        self.article_time_drop = None
        self.article_description = None
        self.article_entry_time = None
        self.headers = {
            'User-Agent':
                # The trailing space keeps "Gecko)" and "Version/" from being
                # fused into one token when the two literals are joined.
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                'Version/14.0.1 Safari/605.1.15'
        }

    def scraping_globenewswire(self, page):
        """Fetch one listing page and queue its articles for insertion.

        Args:
            page: Page number appended to the ``?page=`` query parameter.

        Rows are written through the module-level ``cursor``; the caller is
        responsible for committing.
        """
        url = 'https://www.globenewswire.com/NewsRoom?page=' + str(page)
        r = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        articles = soup.select('.main-container > .row')
        print("GlobeNewswire - Scraping page " + str(page) + "...")
        sleep(randint(0, 1))
        for item in articles:
            link_tag = item.select_one('a[data-autid="article-url"]')
            if link_tag is None:
                # Layout rows without an article link would otherwise crash
                # on `None.text`.
                continue
            time_tag = item.select_one('span[data-autid="article-published-date"]')
            # select_one() takes a CSS selector; the class filter has to be
            # part of the selector rather than a `_class=` keyword.
            description_tag = item.select_one('span.pagging-list-item-text-body')

            self.article_title = link_tag.text.strip()
            self.article_time = time_tag.text.strip() if time_tag is not None else None
            self.article_link = 'https://www.globenewswire.com' + link_tag['href']
            self.article_description = (
                description_tag.text.strip() if description_tag is not None else None
            )
            self.article_entry_time = datetime.now()
            # OR IGNORE skips links already present (unique index on article_link).
            cursor.execute('''INSERT OR IGNORE INTO articlestable VALUES(?,?,?,?,?,?)''',
                           (self.article_time, self.article_title, self.article_keyword,
                            self.article_link, self.article_description, self.article_entry_time))
            print(self.article_title)


scraper = Scrapers()
while True:
    # ----- Range of Pages to scrape through -----
    for x in range(1, 3):
        scraper.scraping_globenewswire(x)
    # ----- Add to Database -----
    connect.commit()
    print("Process done. Starting to sleep again. Time: " + str(datetime.now()))
    sleep(randint(5, 12))

I extracted all the headlines of page=1 from the given URL.
The headlines are inside <a> elements whose data-autid attribute equals article-url.
Select all the <a> with the above attributes using findAll().
Iterate over all the selected <a> above and extract the headlines i.e, text
You can extend this and extract whatever data you need with this approach.
This code will print all the headlines of page=1 from the given URL.
import requests
import bs4 as bs
# Download the first news-room page and print every headline on its own line.
url = 'https://www.globenewswire.com/NewsRoom'
resp = requests.get(url)
soup = bs.BeautifulSoup(resp.text, 'lxml')
# Headline links are the anchors tagged with data-autid="article-url".
headlines = soup.find_all('a', attrs={'data-autid': 'article-url'})
for anchor in headlines:
    print(anchor.text)

Related

How do you iterate over BS4 elements that has the same name?

It only scrapes the first table and I'm not sure on how to get it to scrape the second, they both have the same class.
from bs4 import BeautifulSoup
import requests
def getCalendarData(url):
    """Download ``url`` and print series, circuit, month and day per event."""
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    # (tag name, CSS class) of each value, in the order it is printed.
    fields = (
        ('div', 'ms-schedule-table-item-main__title'),
        ('div', 'ms-schedule-table-item-main__event'),
        ('span', 'ms-schedule-table-date__month'),
        ('span', 'ms-schedule-table-date__day'),
    )
    tables = soup.find_all('table', class_='ms-schedule-table ms-schedule-table--your')
    for table in tables:
        for event in table.find_all('tbody'):
            values = [event.find(tag, class_=css).text.strip() for tag, css in fields]
            print(*values)


getCalendarData('https://www.motorsport.com/all/schedule/2022/upcoming/')
Your question is misleading, there is no second table on this page, there is only the option to load more data.
Unless you want to switch to selenium, you can also address the resource from which the data is dynamically reloaded.
# Request pages 1 and 2 of the resource the site loads its extra rows from.
for page_number in (1, 2):
    getCalendarData(f'https://www.motorsport.com/all/schedule/2022/upcoming/?all_event_types=1&p={page_number}')
Example
A bit more generic with while-loop, to check if there is a load more button:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

url = 'https://www.motorsport.com/all/schedule/2022/upcoming/'


def getCalendarData(table):
    """Print series, circuit, month and day for every event in ``table``.

    Args:
        table: The parsed schedule ``<table>`` element; each ``<tbody>``
            inside it is treated as one event.
    """
    for event in table.find_all('tbody'):
        Series = event.find('div', class_='ms-schedule-table-item-main__title').text.strip()
        Circuit = event.find('div', class_='ms-schedule-table-item-main__event').text.strip()
        Month = event.find('span', class_='ms-schedule-table-date__month').text.strip()
        Day = event.find('span', class_='ms-schedule-table-date__day').text.strip()
        print(Series, Circuit, Month, Day)


# Follow the "next page" link until the site stops offering one.
while True:
    print(f'Scraping url: {url}')
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find('table', class_='ms-schedule-table ms-schedule-table--your')
    if table is None:
        # No schedule table on this page: nothing to print, stop paging.
        break
    getCalendarData(table)
    next_link = soup.select_one('[data-id="nextPage"]')
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        break
    # urljoin copes with hrefs that start with "/" (plain concatenation
    # would produce "//") as well as with already absolute URLs.
    url = urljoin('https://www.motorsport.com/', next_href)

How can I get data from a website using BeautifulSoup and requests?

I am a beginner in web scraping, and I need help with this problem.
The website, allrecipes.com, is a website where you can find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click-> view page source)
I want to create a program that takes an input, searches for it on allrecipes, and returns a list of tuples for the first five recipes, with data such as the preparation time, serving yield, ingredients, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup
def searchdata():
    """Skeleton: search allrecipes for a user-supplied term and visit the top hits.

    The two "fill in" comments mark the parts that are still to be written.
    """
    inp = input('what recipe would you like to search')
    url = 'http://www.allrecipes.com/search/results/?wt=' + str(inp) + '&sort=re'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    #fill in code for finding top 3 or five links
    # Result accumulators are created before the loop so that the loop body
    # can append to them.
    names = []
    time = []
    servings = []
    ratings = []
    ingrediants = []
    # Slicing instead of range(3) avoids an IndexError when fewer than three
    # links were found.
    for link in links[:3]:
        a = requests.get(link)
        soupa = BeautifulSoup(a.text, 'html.parser')
        #fill in code to find name, ingrediants, time, and serving size with data from soupa


searchdata()
Yes, i know, my code is very messy but What should I fill in in the two code fill-in areas?
Thanks
After searching for the recipe you have to get the links of each recipe and then request again for each of those links, because the information you're looking for is not available on the search page. That would not look clean without OOP so here's the class I wrote that does what you want.
import requests
from time import sleep
from bs4 import BeautifulSoup
class Scraper:
    """Search allrecipes.com and print details of the first five recipes."""

    # Defaults only: both names are rebound (never mutated in place) by
    # print_info, so instances do not share state through them.
    links = []
    names = []

    def get_url(self, url):
        """Download ``url`` and store the parsed page in ``self.soup``."""
        url = requests.get(url)
        self.soup = BeautifulSoup(url.content, 'html.parser')

    def print_info(self, name):
        """Search for ``name`` and print the data of up to five results."""
        self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
        count_tag = self.soup.find('span', class_='subtext')
        results = self.soup.find('section', id='fixedGridSection')
        # startswith() also copes with an empty counter text, where indexing
        # [0] would raise.
        no_hits = count_tag is not None and count_tag.text.strip().startswith('0')
        if no_hits or results is None:
            print(f'No recipes found for {name}')
            return
        headings = []
        for article in results.find_all('article'):
            heading = article.find('h3', class_='fixed-recipe-card__h3')
            if heading is None or heading.a is None:
                continue
            headings.append(heading)
            if len(headings) == 5:
                break
        self.links = [heading.a['href'] for heading in headings]
        self.names = [heading.a.span.text for heading in headings]
        self.get_data()

    def get_data(self):
        """Visit every stored recipe link and print its meta data and ingredients."""
        for recipe_name, link in zip(self.names, self.links):
            self.get_url(link)
            print('-' * 4 + recipe_name + '-' * 4)
            info_names = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-header')]
            info_values = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-body')]
            ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
            ingredients = [span.text.strip() for span in ingredient_spans]
            # zip() pairs labels with values without a second index variable
            # and stops at the shorter list instead of raising IndexError.
            for label, value in zip(info_names, info_values):
                print(label.capitalize(), value)
            print()
            # Centre the heading over the first ingredient; fall back to no
            # padding when the page lists no ingredients.
            width = len(ingredients[0]) if ingredients else 0
            print('Ingredients'.center(width, ' '))
            print('\n'.join(ingredients))
            print()
            print('*' * 50, end='\n\n')


chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))

Object Subscriptable

While scraping the website, I am getting this error:
links = [tag.a["href"] for tag in soup.find_all('strong')[1:-3]]
TypeError: 'NoneType' object is not subscriptable
Code:
import requests
import concurrent.futures
from bs4 import BeautifulSoup
HOST = "https://www.lyrics.com"
url = "https://www.lyrics.com/album/3769520/Now+20th+Anniversary%2C+Vol.+2"
# Parse the initial 'album' website
req = requests.get(url)
html = req.content
soup = BeautifulSoup(html, 'html.parser')
# Find all song's links in 'album' site - these can be found under
# the 'strong' tab, and 'a' tab. Some <strong> tags carry no anchor, so they
# are skipped instead of failing with "'NoneType' object is not subscriptable".
links = [tag.a["href"] for tag in soup.find_all('strong')[1:-3] if tag.a is not None]
name = []


def _fetch_song(url):
    """Return ``(file_name, lyrics)`` for one song path, or None if the page lacks either part."""
    page = requests.get(HOST + url)  # songs are found on the HOST website
    song_soup = BeautifulSoup(page.content, 'html.parser')
    title = song_soup.find('h1', {"id": "lyric-title-text"})
    # The lyrics can be found under the 'pre' tab
    body = song_soup.find('pre')
    if title is None or body is None:
        return None
    return title.text + ".txt", body.text


def getLyrics(url):
    """Fetch one song, record its file name in ``name`` and return the lyrics.

    Returns None (and records nothing) when the page has no title or lyrics.
    """
    song = _fetch_song(url)
    if song is None:
        return None
    file_name, text = song
    name.append(file_name)
    return text


# Use multi-threading for faster performance: one thread per song.
lyric = []
if links:  # ThreadPoolExecutor rejects max_workers=0
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(links)) as executor:
        # map() runs the downloads concurrently and yields results in the
        # order of `links`, so `name` and `lyric` stay aligned. Calling
        # .result() right after each submit() would serialise the requests.
        for song in executor.map(_fetch_song, links):
            if song is None:
                continue
            file_name, lyrics = song
            print(lyrics)
            name.append(file_name)
            lyric.append(lyrics)
# Write every song; the previous range(0, len(name)-1) skipped the last one.
for file_name, lyrics in zip(name, lyric):
    with open(file_name, "w", encoding="utf-8") as File:
        File.write(lyrics)
I will be very thankful if you could help me.
you can check if tag.a is not None:
# Keep only the <strong> tags that wrap an anchor so tag.a is never None;
# the [1:-3] slice is applied to the filtered list of hrefs.
links = [tag.a['href'] for tag in soup.find_all('strong') if tag.a is not None][1:-3]
print(links)
# output ['/lyric/35873929/Tik+Tok+%5BNOW+33%5D', ...]

How to efficiently parse large HTML div-class and span data on Python BeautifulSoup?

The data needed:
I want to scrape through two webpages, one here: https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL and the other: https://finance.yahoo.com/quote/AAPL/financials?p=AAPL.
From the first page, I need values of the row called Total Assets. This would be 5 values in that row named: 365,725,000 375,319,000 321,686,000 290,479,000 231,839,000
Then I need 5 values of the row named Total Current Liabilities. These would be: 43,658,000 38,542,000 27,970,000 20,722,000 11,506,000
From the second link, I need 10 values of the row named Operating Income or Loss. These would be: 52,503,000 48,999,000 55,241,000 33,790,000 18,385,000.
EDIT: I need the TTM value too, and then the five years' values mentioned above. Thanks.
Here is the logic of what I want. I want to run this module, and when run, I want the output to be:
TTM array: 365725000, 116866000, 64423000
year1 array: 375319000, 100814000, 70898000
year2 array: 321686000, 79006000, 80610000
My code:
This is what I have written so far. I can extract the value within the div class if I just put it in a variable as shown below. However, how do I loop efficiently through the 'div' classes as there are thousands of them in the page. In other words, how do I find just the values I am looking for?
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Page to scrape
url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
# Download the page
response = requests.get(url)
# Parse the downloaded HTML
soup = BeautifulSoup(response.text, "html.parser")
# Hand-copied fragments used to try out the lookup
soup1 = BeautifulSoup("""<div class="D(tbc) Ta(end) Pstart(6px) Pend(4px) Bxz(bb) Py(8px) BdB Bdc($seperatorColor) Miw(90px) Miw(110px)--pnclg" data-test="fin-col"><span>321,686,000</span></div>""", "html.parser")
spup2 = BeautifulSoup("""<span data-reactid="1377">""", "html.parser")
#This works
cell_classes = "D(tbc) Ta(end) Pstart(6px) Pend(4px) Bxz(bb) Py(8px) BdB Bdc($seperatorColor) Miw(90px) Miw(110px)--pnclg"
sample_cell = soup1.find("div", class_=cell_classes)
print(sample_cell.text)
#How to loop through all the relevant div classes?
EDIT - At the request of @Life is complex, edited to add date headings.
Try this using lxml:
import requests
from lxml import html
# Balance sheet and income statement pages for AAPL.
url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
url2 = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
page = requests.get(url)
page2 = requests.get(url2)
tree = html.fromstring(page.content)
tree2 = html.fromstring(page2.content)
total_assets = []
Total_Current_Liabilities = []
Operating_Income_or_Loss = []
heads = []
# XPath attribute tests are written with "@"; the "#" that appeared in these
# expressions is not valid XPath.
path = '//div[@class="rw-expnded"][@data-test="fin-row"][@data-reactid]'
# From a row's title <div>, step up two levels and read the value spans.
data_path = '../../div/span/text()'
heads_path = '//div[contains(@class,"D(ib) Fw(b) Ta(end)")]/span/text()'
dats = [tree.xpath(path), tree2.xpath(path)]
for entry in dats:
    # Column headings (dates / ttm) of the page this entry came from.
    heads.append(entry[0].xpath(heads_path))
    for d in entry[0]:
        for s in d.xpath('//div[@title]'):
            if s.attrib['title'] == 'Total Assets':
                total_assets.append(s.xpath(data_path))
            if s.attrib['title'] == 'Total Current Liabilities':
                Total_Current_Liabilities.append(s.xpath(data_path))
            if s.attrib['title'] == 'Operating Income or Loss':
                Operating_Income_or_Loss.append(s.xpath(data_path))
# The first collected entry of each list is dropped; the next one is printed.
del total_assets[0]
del Total_Current_Liabilities[0]
del Operating_Income_or_Loss[0]
print('Date Total Assets Total_Current_Liabilities:')
for date, asset, current in zip(heads[0], total_assets[0], Total_Current_Liabilities[0]):
    print(date, asset, current)
print('Operating Income or Loss:')
for head, income in zip(heads[1], Operating_Income_or_Loss[0]):
    print(head, income)
Output:
Date Total Assets Total_Current_Liabilities:
9/29/2018 365,725,000 116,866,000
9/29/2017 375,319,000 100,814,000
9/29/2016 321,686,000 79,006,000
Operating Income or Loss:
ttm 64,423,000
9/29/2018 70,898,000
9/29/2017 61,344,000
9/29/2016 60,024,000
Of course, if so desired, this can be easily incorporated into a pandas dataframe.
Here are some suggestions for parsing HTML with 'BeautifulSoup' that have helped me and may help you.
Use 'id' to locate an element instead of 'class', because the 'class' changes more frequently than the id.
Use structural information to locate an element instead of 'class'; the structure changes less frequently.
Sending headers with user-agent info is always better than sending no headers. In this case, if you do not specify headers, you cannot find the id 'Col1-1-Financials-Proxy', but you can find 'Col1-3-Financials-Proxy', which is not the same as the result in the Chrome inspector.
Here is runnable code for your requirement that uses structural information to locate the elements. You can certainly use 'class' info to do the same. Just remember that when your code does not work well, check the website's source code.
# import libraries
import requests
from bs4 import BeautifulSoup
# set the URLs you want to webscrape from
first_page_url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
second_page_url = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
# browser-like User-Agent sent with both requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
#################
# first page
#################
print('*' * 10, ' FIRST PAGE RESULT ', '*' * 10)
# result dicts map a column heading to the cell text of the named row
total_assets = {}
total_current_liabilities = {}
operating_income_or_loss = {}
# column headings of the first and second page, in column order
page1_table_keys = []
page2_table_keys = []
# connect to the first page URL
response = requests.get(first_page_url, headers=headers)
# parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
# the nearest id to get the result
sheet = soup.find(id='Col1-1-Financials-Proxy')
# direct <div> children of the section only (recursive=False)
sheet_section_divs = sheet.section.find_all('div', recursive=False)
# last child
sheet_data_div = sheet_section_divs[-1]
# two levels down; first child is used as the header, last child as the body
div_ele_table = sheet_data_div.find('div').find('div').find_all('div', recursive=False)
# table header
div_ele_header = div_ele_table[0].find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(div_ele_header)):
    page1_table_keys.append(div_ele_header[i].find('span').text)
# table body
div_ele = div_ele_table[-1]
div_eles = div_ele.find_all('div', recursive=False)
# row used for Total Assets: last row of the last child of the first body group
tgt_div_ele1 = div_eles[0].find_all('div', recursive=False)[-1]
tgt_div_ele1_row = tgt_div_ele1.find_all('div', recursive=False)[-1]
tgt_div_ele1_row_eles = tgt_div_ele1_row.find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(tgt_div_ele1_row_eles)):
    total_assets[page1_table_keys[i - 1]] = tgt_div_ele1_row_eles[i].find('span').text
# row used for Total Current Liabilities: repeatedly take the last child of
# the second body group
tgt_div_ele2 = div_eles[1].find_all('div', recursive=False)[-1]
tgt_div_ele2 = tgt_div_ele2.find('div').find_all('div', recursive=False)[-1]
tgt_div_ele2 = tgt_div_ele2.find('div').find_all('div', recursive=False)[-1]
tgt_div_ele2_row = tgt_div_ele2.find_all('div', recursive=False)[-1]
tgt_div_ele2_row_eles = tgt_div_ele2_row.find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(tgt_div_ele2_row_eles)):
    total_current_liabilities[page1_table_keys[i - 1]] = tgt_div_ele2_row_eles[i].find('span').text
print('Total Assets', total_assets)
print('Total Current Liabilities', total_current_liabilities)
#################
# second page, same logic as the first page
#################
print('*' * 10, ' SECOND PAGE RESULT ', '*' * 10)
# Connect to the second page URL
response = requests.get(second_page_url, headers=headers)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
# the nearest id to get the result
sheet = soup.find(id='Col1-1-Financials-Proxy')
sheet_section_divs = sheet.section.find_all('div', recursive=False)
# last child
sheet_data_div = sheet_section_divs[-1]
div_ele_table = sheet_data_div.find('div').find('div').find_all('div', recursive=False)
# table header
div_ele_header = div_ele_table[0].find('div').find_all('div', recursive=False)
# first element is label, the remaining element containing data, so use range(1, len())
for i in range(1, len(div_ele_header)):
    page2_table_keys.append(div_ele_header[i].find('span').text)
# table body
div_ele = div_ele_table[-1]
div_eles = div_ele.find_all('div', recursive=False)
# fifth body row (index 4) is used for Operating Income or Loss
tgt_div_ele_row = div_eles[4]
tgt_div_ele_row_eles = tgt_div_ele_row.find('div').find_all('div', recursive=False)
# first element is label, the remaining elements contain data
for i in range(1, len(tgt_div_ele_row_eles)):
    operating_income_or_loss[page2_table_keys[i - 1]] = tgt_div_ele_row_eles[i].find('span').text
print('Operating Income or Loss', operating_income_or_loss)
Output with header info:
********** FIRST PAGE RESULT **********
Total Assets {'9/29/2018': '365,725,000', '9/29/2017': '375,319,000', '9/29/2016': '321,686,000'}
Total Current Liabilities {'9/29/2018': '116,866,000', '9/29/2017': '100,814,000', '9/29/2016': '79,006,000'}
********** SECOND PAGE RESULT **********
Operating Income or Loss {'ttm': '64,423,000', '9/29/2018': '70,898,000', '9/29/2017': '61,344,000', '9/29/2016': '60,024,000'}

Scrape page with generator

I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
# Reader over the semicolon-delimited list of top-level domains.
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
# Writer for the comma-delimited output file; scrape_page writes one row per site to it.
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
# Top-level domain whose ranking pages are scraped.
tld = "uz"
# NOTE(review): has_next and page are not read by any function shown here.
has_next = True
page = 0
def create_link(tld, page):
    """Return the ranking URL for ``tld``.

    Args:
        tld: Top-level domain without the dot, e.g. ``"uz"``.
        page: Page number; ``0`` selects the URL without a ``/page/`` suffix.

    Returns:
        The URL as a string.
    """
    link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    if page != 0:
        # str() rather than repr(): repr would add quotes around a string
        # page number (and an "L" suffix to a Python 2 long).
        link += "/page/" + str(page)
    return link
def check_for_next(soup):
    """Return True unless the page shows a disabled "Next" paging control.

    Args:
        soup: Parsed page; elements with class ``pagingDivDisabled`` are
            inspected.

    Returns:
        False when any disabled paging element contains the text "Next",
        True otherwise (including when there is no disabled element).
    """
    # `"Next" in tag` tests membership in the tag's child list, so it only
    # matched a child that is exactly "Next"; compare against the text.
    # All disabled controls are checked because "Next" need not be the first
    # one (e.g. a disabled "Previous" could precede it - assumption about the
    # page layout, harmless if it never occurs).
    for disabled_nav in soup.find_all(class_="pagingDivDisabled"):
        if "Next" in disabled_nav.get_text():
            return False
    return True
def make_soup(link):
    """Fetch ``link`` through jabba_webkit and parse the result with lxml."""
    return BeautifulSoup(jw.get_page(link), "lxml")
def all_the_pages(counter):
    """Yield consecutive page numbers, starting at ``counter``.

    Every page that is fetched is yielded, including the final one; the
    generator stops after the page on which check_for_next reports that no
    further page exists. Previously the final page was dropped because it
    was only yielded when a next page existed.
    """
    while True:
        soup = make_soup(create_link(tld, counter))
        yield counter
        if not check_for_next(soup):
            return
        counter += 1
def scrape_page(soup):
    """Write one ``[tld, site]`` row per ranked site on the page to ``sites``.

    Args:
        soup: Parsed ranking page.

    Pages without a ``rankTable`` table are skipped silently.
    """
    table = soup.find('table', {'class': 'rankTable'})
    if table is None:
        return
    # Fall back to the table itself when the markup has no <tbody>.
    body = table.find('tbody')
    if body is None:
        body = table
    cells = body.find_all("td")
    # Every third cell, starting with the second, is written out.
    for cell in cells[1::3]:
        # get_text() returns the cell's text directly, replacing the regex
        # that stripped tags from the cell's repr().
        sites.writerow([tld, cell.get_text()])
def main():
    """Scrape every page number produced by all_the_pages, starting at 0."""
    for page in all_the_pages(0):
        print(page)
        link = create_link(tld, page)
        print(link)
        scrape_page(make_soup(link))


main()
My thinking behind the code:
The scraper should get the page, determine whether another page follows, scrape the current page and move on to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm doing it here make sense?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
    """Return the first page number whose ranking table has no data rows.

    Pages are requested starting at 1; counting stops at the first page
    whose ``rankTable`` is missing or has at most one ``<tr>``.
    """
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        # Naming the parser keeps the result independent of which optional
        # parsers are installed and avoids bs4's "no parser specified" warning.
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', {'class': 'rankTable'})
        # A page without the table counts as empty instead of raising
        # AttributeError on None.
        if table is None or len(table.find_all('tr')) <= 1:
            return pages
        pages += 1

Categories

Resources