Unable to scrape data after a dropdown - python

from bs4 import BeautifulSoup
import requests

source = requests.get("https://www.nytimes.com/interactive/2021/us/covid-cases.html").text
soup = BeautifulSoup(source, "lxml")

states = soup.find("tbody", class_="children").find_all("tr")
# print(state.prettify())
for state in states:
    # determining the name of the state
    name = state.a.text
    final_name = ""
    for character in name:
        if character in "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM ":
            final_name += character
    print(final_name)

    # finding the daily number of cases on average
    try:
        daily_cases_avg = state.find("td", class_="bignum cases show-mobile").text
    except Exception:
        daily_cases_avg = None
    print(daily_cases_avg)

    # finding the number of cases per 100,000
    try:
        num_cases_per_hunThous = state.find("td", class_="num cases show-mobile").text
    except Exception:
        num_cases_per_hunThous = None
    print(num_cases_per_hunThous)

    # finding percent change over the past 14 days
    try:
        pct_change_cases_14 = state.find("td", class_="chart cases wider td-end show-mobile").span.text
    except Exception:
        pct_change_cases_14 = None
    print(pct_change_cases_14)

    # daily average of the number of people hospitalized
    try:
        daily_hos_avg = state.find_all("td", class_="bignum")[1].text
    except Exception:
        daily_hos_avg = None
    print(daily_hos_avg)

    # number of people hospitalized per 100,000
    try:
        num_hos_hunThous = state.find_all("td", class_="num")[1].text
    except Exception:
        num_hos_hunThous = None
    print(num_hos_hunThous)

    # percent change of the number of hospitalized people over the past 14 days
    try:
        pct_change_hos_14 = state.find("td", class_="num td-end").text
    except Exception:
        pct_change_hos_14 = None
    print(pct_change_hos_14)

    # daily average of deaths
    try:
        daily_death_avg = state.find_all("td", class_="bignum")[2].text
    except Exception:
        daily_death_avg = None
    print(daily_death_avg)

    # number of deaths per 100,000
    try:
        deaths_hunThous = state.find_all("td", class_="num td-end")[1].text
    except Exception:
        deaths_hunThous = None
    print(deaths_hunThous)

    # percent of people fully vaccinated
    try:
        pct_vac = state.find("td", class_="num vax td-end").text
    except Exception:
        pct_vac = None
    print(pct_vac)
All I am trying to do is scrape COVID-19 data off of the New York Times. I am a beginner, so I am just using this as a way to learn how to scrape websites efficiently. However, only the states that show up before a dropdown are being scraped.
On the website, after the state of Illinois, there is a "Show all" button. The states that appear after clicking that button are not getting scraped, so I was wondering how I can get past that to get data for all of the states.

If you open developer tools and go to the Network tab, you can see all of the requests the page sends. I found one request it makes: https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/counties.json
The website receives a link for every county. Each element in the JSON object contains a link to another NYT article that contains the average cases for that individual county.
It would be more complicated, but you could write a script that goes through each of these counties and scrapes the data, then adds up the average cases for each state based on each individual county.
That is what I would do.
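As a rough illustration of that idea, here is a minimal sketch that downloads counties.json and groups it by state. The field names I use below ("state", "avg_cases") are guesses, not the feed's documented structure, so print one record first and adjust the keys to whatever the feed actually contains:
import requests
from collections import defaultdict

# Feed found in the browser's Network tab (see above).
url = "https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/counties.json"
counties = requests.get(url).json()

# Inspect one record to learn the real structure before relying on any keys.
print(counties[0] if isinstance(counties, list) else list(counties)[:1])

# Hypothetical aggregation: sum each county's average cases per state.
# The keys "state" and "avg_cases" are placeholders, not the feed's real names.
cases_by_state = defaultdict(float)
for county in counties if isinstance(counties, list) else []:
    state = county.get("state")
    avg = county.get("avg_cases")
    if state is not None and avg is not None:
        cases_by_state[state] += avg

for state, total in sorted(cases_by_state.items()):
    print(state, total)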

Related

Python Web Scraping error - Reading from JSON- IndexError: list index out of range - how do I ignore

I am performing web scraping via Python \ Selenium \ Chrome headless driver. I am reading the results from JSON - here is my code:
CustId = 500
while (CustId <= 510):
    print(CustId)
    # Part 1: Customer REST call:
    urlg = f'https://mywebsite/customerRest/show/?id={CustId}'
    driver.get(urlg)
    soup = BeautifulSoup(driver.page_source, "lxml")
    dict_from_json = json.loads(soup.find("body").text)
    # print(dict_from_json)
    #try:
    CustID = (dict_from_json['customerAddressCreateCommand']['customerId'])
    # Addr = (dict_from_json['customerShowCommand']['customerAddressShowCommandSet'][0]['addressDisplayName'])
    writefunction()
    CustId = CustId + 1
The issue is that sometimes 'addressDisplayName' will be present in the result set and sometimes not. If it's not, it fails with the error:
IndexError: list index out of range
Which makes sense, as it doesn't exist. How do I ignore this, though, so that if 'addressDisplayName' doesn't exist the loop just continues? I've tried using a try but the code still stops executing.
A try..except block should resolve your issue.
CustId = 500
while (CustId <= 510):
    print(CustId)
    # Part 1: Customer REST call:
    urlg = f'https://mywebsite/customerRest/show/?id={CustId}'
    driver.get(urlg)
    soup = BeautifulSoup(driver.page_source, "lxml")
    dict_from_json = json.loads(soup.find("body").text)
    # print(dict_from_json)
    CustID = (dict_from_json['customerAddressCreateCommand']['customerId'])
    try:
        Addr = (dict_from_json['customerShowCommand']['customerAddressShowCommandSet'][0]['addressDisplayName'])
    except:
        Addr = "NaN"
    CustId = CustId + 1
If you get an IndexError (with an index of '0') it means that your list is empty, so the problem is one step earlier in the path (otherwise you'd get a KeyError if 'addressDisplayName' were missing from the dict).
You can check whether the list has elements:
if dict_from_json['customerShowCommand']['customerAddressShowCommandSet']:
    # get the data
Otherwise you can indeed use try..except:
try:
    # get the data
except (IndexError, KeyError):
    # handle missing data
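Putting the two suggestions together, a minimal sketch of the loop body might look like this (dict_from_json, writefunction() and CustId come from the question; which exception you catch depends on where the data is actually missing):
command_set = dict_from_json['customerShowCommand']['customerAddressShowCommandSet']
try:
    # IndexError if the list is empty, KeyError if the field itself is absent
    Addr = command_set[0]['addressDisplayName']
except (IndexError, KeyError):
    Addr = "NaN"  # fall back so the loop keeps running
writefunction()
CustId = CustId + 1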

how to make except only one value of some in selenium?

I want to find the title, address, and price of some items in an online mall.
But sometimes the address is empty and my code breaks (below is only the Selenium part):
num = 1
while 1:
    try:
        title = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/span').text
        datas_title.append(title)
        address = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]').text
        datas_address.append(address)
        price = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/p').text
        datas_price.append(price)
        print('crawling....num = ' + str(num))
        num = num + 1
    except Exception as e:
        print("finish get data...")
        break
print(datas_title)
print(datas_address)
print(datas_price)
What should I do if the address is empty? Just ignore it and move on to the next items?
Use this so you can skip the entries with missing information:
num = 1
while 1:
    try:
        title = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/span').text
        datas_title.append(title)
        address = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]').text
        datas_address.append(address)
        price = browser.find_element_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/p').text
        datas_price.append(price)
        print('crawling....num = ' + str(num))
        num = num + 1
    except:
        print("an error was encountered")
        # skip this entry and try the next one instead of retrying the same index
        num = num + 1
        continue
print(datas_title)
print(datas_address)
print(datas_price)
address = browser.find_elements_by_xpath('//*[@id="root"]/div[1]/section/article/div/div[' + str(num) + ']/div/div/a/div/p[2]')
if not address:
    address = "None"
else:
    address = address[0].text
datas_address.append(address)
You could use find_elements to check whether the result is empty and then proceed with either value. You can then encapsulate this into a function, pass it the XPath and the data list, and your code becomes repeatable; a sketch follows below.
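A minimal sketch of that idea, assuming the browser object and the datas_* lists from the question; the helper name get_optional_text is just illustrative:
def get_optional_text(browser, xpath, default="None"):
    # find_elements returns an empty list instead of raising when nothing matches
    elements = browser.find_elements_by_xpath(xpath)
    return elements[0].text if elements else default

base = '//*[@id="root"]/div[1]/section/article/div/div[{}]/div/div/a'
num = 1
while True:
    title = get_optional_text(browser, base.format(num) + '/span', default=None)
    if title is None:
        break  # no more items on the page
    datas_title.append(title)
    datas_address.append(get_optional_text(browser, base.format(num) + '/div/p[2]'))
    datas_price.append(get_optional_text(browser, base.format(num) + '/p'))
    num += 1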
I think you need to first check that the web element returned isn't None, and then proceed with fetching the text.
You could write a function for it and catch that exception inside it.

How to return another value (probably nil/nan) if a condition is not met

I'm trying to fetch product info from different pages on a site. I wrote a function to loop through each product and collect the information. When it reaches certain pages the pattern changes, so the function cannot fetch the info and runs into an error instead. How can I make the code return a NaN value if the condition is not met?
Here is my function:
def get_quality_rating_info(links):
    Quality_rating = []
    ratings = []
    while len(Quality_rating) < len(links):
        for link in links:
            print("Progress: Size of data collected {}... {}/{}".format(str(len(Quality_rating)), str(len(Quality_rating)), str(len(links))))
            product_url = requests.get(link)
            new_soup = BeautifulSoup(product_url.text, "html.parser")
            for i in range(4):
                quality = new_soup.findAll("div", {"class": "value"})[i]
                ratings.append(str(quality.text.strip()))
            Quality_rating.append(ratings[0])
            ### Resets the list after appending the ratings
            ratings = []
    print("Successfully collected Quality rating data!")
    return Quality_rating
one of the links before the page style changed: https://www.jdpower.com/detail/2020/dodge/charger/flagstaff-az/2c3cdxhg6lh245286
one of the links after it changed: https://www.jdpower.com/detail/2020/ram/1500/bakersfield-ca/1c6rrebg4ln267192
Here is the error info:
Wrap the entire block that might raise the exception in a try...except statement, and in the except block return whatever you prefer, e.g. None.
https://docs.python.org/3/tutorial/errors.html
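Applied to the function from the question, a minimal sketch might look like this (the selectors and names are taken from the question; float('nan') is just one possible placeholder value):
import requests
from bs4 import BeautifulSoup

def get_quality_rating_info(links):
    Quality_rating = []
    for link in links:
        try:
            product_url = requests.get(link)
            new_soup = BeautifulSoup(product_url.text, "html.parser")
            # IndexError here if the page layout changed and the "value" divs are missing
            quality = new_soup.findAll("div", {"class": "value"})[0]
            Quality_rating.append(quality.text.strip())
        except Exception:
            # page did not match the expected pattern: record a NaN and move on
            Quality_rating.append(float("nan"))
    return Quality_rating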

Avoiding rate limit error using tweepy

I am trying to get some basic information on all the twitter friends that a particular user is following. I am using a for loop to get this information but if the user has many friends I get a rate limit error. I am trying and struggling to integrate a way to get around the rate limit into my for loop. Thank you for any suggestions!!
My original code:
data = []
for follower in followers:
    carrot = api.get_user(follower)
    data.append("[")
    data.append(carrot.screen_name)
    data.append(carrot.description)
    data.append("]")
My attempt at getting around rate limit error:
data = []
for follower in followers:
    carrot = api.get_user(follower, include_entities=True)
    while True:
        try:
            data.append("[")
            data.append(carrot.screen_name)
            data.append(carrot.description)
            data.append("]")
        except tweepy.TweepError:
            time.sleep(60 * 15)
            continue
        except StopIteration:
            break
The problem is that it is probably get_user that is throwing the error. Try putting api.get_user inside your try block.
Code below.
data = []
for follower in followers:
    while True:
        try:
            carrot = api.get_user(follower, include_entities=True)
        except tweepy.TweepError:
            time.sleep(60 * 15)
            continue
        except StopIteration:
            pass
        break
    data.append([carrot.screen_name, carrot.description])
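Depending on which tweepy version is in use, you may also be able to let the library handle the waiting itself by constructing the API object with wait_on_rate_limit enabled; a minimal sketch (the credential values are placeholders):
import tweepy

# placeholder credentials: fill in with your own keys
consumer_key = "..."
consumer_secret = "..."
access_token = "..."
access_token_secret = "..."

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# tweepy then sleeps automatically whenever Twitter's rate limit is reached
api = tweepy.API(auth, wait_on_rate_limit=True)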
How do you intend to store these values? Isn't the following easier to work with:
["John", "Traveller"]
rather than your code, which stores it as
["[", "John", "Traveller", "]"]

Fetching the first image from a website that belongs to the post

I've written a program that fetches the desired information from a blog or any page. The next thing I want to achieve is to retrieve the first image from that page that belongs to the respective post (just like Facebook does when a post is shared).
I was able to achieve this to some extent by fetching the first image with an alt tag (since many websites don't have alt tags on their logos, icons, etc., the first one with an alt tag should belong to the post). But this does not seem to work in some cases. Is there any other (better) way to achieve this?
I'm using Python 2.7.9 and BeautifulSoup 4.
import time
import feedparser
import requests
from bs4 import BeautifulSoup

d = feedparser.parse('http://rss.cnn.com/rss/edition.rss')

for entry in d.entries:
    try:
        if entry.title is not None:
            print entry.title
            print ""
    except Exception, e:
        print e
    try:
        if entry.link is not None:
            print entry.link
            print ""
    except Exception, e:
        print e
    try:
        if entry.published[5:16] is not None:
            print entry.published[5:16]
            print ""
    except Exception, e:
        print e
    try:
        if entry.category is not None:
            print entry.category
            print ""
    except Exception, e:
        print e
    try:
        if entry.get('summary', '') is not None:
            print entry.get('summary', '')
            print ""
    except Exception, e:
        print e
    time.sleep(5)
    r = requests.get(entry.link, headers={'User-Agent': 'Safari/534.55.3 '})
    soup = BeautifulSoup(r.text, 'html.parser')
    for img in soup.findAll('img'):
        if img.has_attr('alt'):
            if img['src'].endswith('.jpg') or img['src'].endswith('.png'):
                print img['src']
                break
It is probably more practical to take a look at the opengraph module:
https://pypi.python.org/pypi/opengraph/0.5
and adjust it the way you like.
It will fetch the "first image" from the HTML code or use og:image.
If you want to learn, you can also do it by looking at the source code. The module uses BeautifulSoup too.
I needed the following monkeypatch to activate scraping as fallback:
import re
from bs4 import BeautifulSoup
from opengraph import OpenGraph

def parser(self, html):
    """
    """
    if not isinstance(html, BeautifulSoup):
        doc = BeautifulSoup(html, from_encoding='utf-8')
    else:
        doc = html
    ogs = doc.html.head.findAll(property=re.compile(r'^og'))
    for og in ogs:
        self[og[u'property'][3:]] = og[u'content']
    # Couldn't fetch all attrs from og tags, try scraping body
    if not self.is_valid() and self.scrape:
        for attr in self.required_attrs:
            if not hasattr(self, attr):
                try:
                    self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                except AttributeError:
                    pass

OpenGraph.parser = parser
OpenGraph.scrape = True  # workaround for some subtle bug in opengraph
You may need to handle relative URLs in the image sources, but that is quite straightforward with urljoin from urlparse.
import opengraph
...
page = opengraph.OpenGraph(url=link, scrape=True)
...
if page.is_valid():
    ...
    image_url = page.get('image', None)
    ...
    if not image_url.startswith('http'):
        image_url = urljoin(page['_url'], page['image'])
    ...
(some checks are omitted for brevity in this code fragment)
