I'm learning web scraping and working on Eat24 (Yelp's website). I'm able to scrape basic data from Yelp, but I'm unable to do something pretty simple: append that data to a DataFrame. Here is my code; I've annotated it so it should be simple to follow along.
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome()
#go to eat24, type in zip code 10007, choose pickup and click search
driver.get("https://new-york.eat24hours.com/restaurants/index.php")
search_area = driver.find_element_by_name("address_auto_complete")
search_area.send_keys("10007")
pickup_element = driver.find_element_by_xpath("//*[@id='search_form']/div/table/tbody/tr/td[2]")
pickup_element.click()
search_button = driver.find_element_by_xpath("//*[@id='search_form']/div/table/tbody/tr/td[3]/button")
search_button.click()
#scroll up and down on page to load more of 'infinity' list
for i in range(0,3):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.execute_script("window.scrollTo(0,0);")
    time.sleep(1)
#find menu urls
menu_urls = [page.get_attribute('href') for page in
             driver.find_elements_by_xpath('//*[@title="View Menu"]')]
df = pd.DataFrame(columns=['name', 'menuitems'])
#collect menu items/prices/name from each URL
for url in menu_urls:
    driver.get(url)
    menu_items = driver.find_elements_by_class_name("cpa")
    menu_items = [x.text for x in menu_items]
    menu_prices = driver.find_elements_by_class_name('item_price')
    menu_prices = [x.text for x in menu_prices]
    name = driver.find_element_by_id('restaurant_name')
    menuitems = dict(zip(menu_items, menu_prices))
    df['name'] = name
    df['menuitems'] = menuitems
    df.to_csv('test.csv', index=False)
The problem is at the end. It isn't adding menuitems + name into successive rows in the dataframe. I have tried using .loc and other functions but it got messy so I removed my attempts. Any help would be appreciated!!
Edit: the error I get is "ValueError: Length of values does not match length of index" when the for loop attempts to add a second set of menu items / restaurant name to the dataframe.
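For illustration, that class of error can be reproduced without any scraping at all; the restaurant and menu values below are made up:

import pandas as pd

df = pd.DataFrame({'name': ['Some Restaurant']})   # one row
# Assigning a whole column from a list whose length differs from the index fails:
df['menuitems'] = ['burger', 'fries']               # two values for a one-row index
# -> ValueError: Length of values does not match length of index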
I figured out a simple solution, not sure why I didn't think of it before. I added a "row" count that goes up by 1 on each iteration, and used .loc to place data in the "row"th row
row = 0
for url in menu_urls:
    row += 1
    driver.get(url)
    menu_items = driver.find_elements_by_class_name("cpa")
    menu_items = [x.text for x in menu_items]
    menu_prices = driver.find_elements_by_class_name('item_price')
    menu_prices = [x.text for x in menu_prices]
    name = driver.find_element_by_id('restaurant_name').text
    menuitems = [dict(zip(menu_items, menu_prices))]
    df.loc[row, 'name'] = name
    df.loc[row, 'menuitems'] = menuitems
print(df)
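A different pattern (not from the original post) avoids the manual row counter entirely: collect one dict per restaurant in a plain list and build the DataFrame once at the end. A minimal sketch reusing the same driver, menu_urls and locators as above:

rows = []
for url in menu_urls:
    driver.get(url)
    menu_items = [x.text for x in driver.find_elements_by_class_name("cpa")]
    menu_prices = [x.text for x in driver.find_elements_by_class_name('item_price')]
    name = driver.find_element_by_id('restaurant_name').text
    # one dict per restaurant; pandas turns a list of dicts into rows
    rows.append({'name': name, 'menuitems': dict(zip(menu_items, menu_prices))})

df = pd.DataFrame(rows, columns=['name', 'menuitems'])
df.to_csv('test.csv', index=False)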
Related
My scraper calls the website, hits each of the 44 pages, and creates a CSV file, but the CSV file is empty. I return from each of the functions and save the data to a CSV at the end of the scraper.
Can anyone see what is wrong with my code?
Code:
import pandas,requests,bs4,time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-"+TODAY+".csv"
driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)
URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
    'From': 'myemail'
}
def interceptor(request):
    del request.headers['From']
    request.headers['From'] = HEADERS["From"]
driver.request_interceptor = interceptor
def parse_job_post_div(div_html):
    soup = bs4.BeautifulSoup(div_html)
    job_ls = soup.findAll("div", {"class": "information"})
    job_data = []
    for job in job_ls:
        job_listing = job.find("div", {"class": "information"}).get_text(separator=", ").strip()
        title = job.find("span", {"role": "heading"}).get_text(separator=", ").strip()
        job_location = job.find("p", {"class": "job-info"}).get_text(separator=", ").strip()
        new_row = {"job_listing": job, "title": title, "job_location": job_location}
        job_data.append(new_row)
    return job_data
def get_data(wd):
    job_postings = driver.find_element(By.CLASS_NAME, "information")
    html = job_postings.get_attribute("innerHTML")
    parsed = parse_job_post_div(html)
    return pandas.DataFrame(parsed)
def process_page(url):
    driver.get(url)
    master_data = []
    i = 0
    while True:
        df = get_data(driver)
        master_data.append(df)
        if i == (MAX_PAGE - 1):
            break
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
        time.sleep(10)
        print(i)
        i += 1
    return pandas.concat(master_data, ignore_index=True)
data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
I have tried the above code.
The first problem I found in your code is that job_ls is an empty list, i.e. soup.findAll("div", {"class": "information"}) doesn't find anything.
Moreover, job_postings contains only one WebElement (i.e. the first job of the list) instead of all 10 jobs shown on the page; that's because you used .find_element instead of .find_elements. As a result of these and other problems, process_page(URL_BASE) returns an empty dataframe.
In this case you can speed up the process and use less code by using Selenium directly instead of bs4:
driver.get(URL_BASE)
driver.implicitly_wait(30)
MAX_PAGE = 4
titles, locations, descriptions = [], [], []
for i in range(MAX_PAGE):
    print('current page:', i+1, end='\r')
    titles += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > span[role=heading]')]
    locations += [loc.text.replace('\n', ', ') for loc in driver.find_elements(By.CSS_SELECTOR, '.information > p[class=job-info]')]
    descriptions += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > p[data-ph-at-id=jobdescription-text]')]
    if i < MAX_PAGE - 1:
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
    else:
        break
df = pandas.DataFrame({'title': titles, 'location': locations, 'description': descriptions})
df.to_csv(SAVE_FILENAME, index=False)
and df will be a table with one row per job, with a title, location and description column.
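One note on pagination, not part of the original answer: clicking the next-page arrow without a wait can race the re-render, so a later page may be scraped before it has loaded. A minimal sketch of an explicit wait, assuming the old result cards go stale once the next page is rendered:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def go_to_next_page(driver, timeout=15):
    # Keep a handle on a result card from the current page, click "next",
    # then wait until that handle goes stale (i.e. the results were replaced).
    first_card = driver.find_element(By.CSS_SELECTOR, '.information')
    driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
    WebDriverWait(driver, timeout).until(EC.staleness_of(first_card))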
I am trying to extract the balance sheet for an example ticker "MSFT" (Microsoft) from Yahoo Finance.
Using Selenium to click on the button "Expand All" before any scraping is done. This part seems to work.
By the way, when the Chrome web driver is launched, I manually click on the button(s) to accept or reject cookies. In a later step, I plan to add some more code so that this part is also automated; my question, though, is not about that part right now.
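(As a side note for that later automation step, here is a minimal sketch of dismissing such a consent dialog; the button text matched below is an assumption and has to be checked against the live consent page:)

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def dismiss_consent_if_present(driver, timeout=10):
    # Hypothetical locator: a button whose text contains "Accept"; adjust to the real page.
    try:
        WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(
            (By.XPATH, "//button[contains(., 'Accept')]"))).click()
    except TimeoutException:
        pass  # no consent dialog appeared within the timeout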
Below is how the code currently looks:
# for scraping the balance sheet from Yahoo Finance
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
        By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()
    #content whole page in html format
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)
    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    df = pd.DataFrame(columns=col)
    # the following code returns an empty list for index (why?)
    # and values in a list that need actually be in a DataFrame
    idx = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        for h in div.find_all('title'):
            text = h.get_text()
            idx.append(text)
    val = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-col'}):
        for h in div.find_all('span'):
            num = int(h.get_text().replace(",", "")) * 1000
            val.append(num)
    # if the above part is commented out and this block is used instead
    # the following code manages to work well until the row "Cash Equivalents"
    # that is because there are no entries for years 2020 and 2019 on this row
    """ for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        i = 0
        idx = ""
        val = []
        for h in div.find_all('span'):
            if i % 5 == 0:
                idx = h.get_text()
            else:
                num = int(h.get_text().replace(",", "")) * 1000
                val.append(num)
            i += 1
        row = pd.DataFrame([val], columns=col, index=[idx])
        df = pd.concat([df, row], axis=0) """
    return idx, val
get_balance_sheet_from_yfinance("MSFT")
I could not get the data scraped from the expanded table in a usable tabular format. Instead, the function above returns what I managed to scrape from the webpage. There are some additional comments in the code.
Could you give me some ideas on how to properly extract the data and put it into a DataFrame object with index which should be the text under the "Breakdown" column? Basically, the DataFrame should look like the snapshot below, with what is under the first column in there being the index.
[snapshot of the desired balance sheet DataFrame]
I've spent a long time on this; hope it helps. Basically, your function now returns a DataFrame with the following formatting:
2022-06-29 2021-06-29 2020-06-29 2019-06-29
Total Assets 364,840,000 333,779,000 301,311,000 286,556,000
Current Assets 169,684,000 184,406,000 181,915,000 175,552,000
Cash, Cash Equivalents & Short Term Investments 104,749,000 130,334,000 136,527,000 133,819,000
Cash And Cash Equivalents 13,931,000 14,224,000 13,576,000 11,356,000
Cash 8,258,000 7,272,000 - -
... ... ... ... ...
Tangible Book Value 87,720,000 84,477,000 67,915,000 52,554,000
Total Debt 61,270,000 67,775,000 70,998,000 78,366,000
Net Debt 35,850,000 43,922,000 49,751,000 60,822,000
Share Issued 7,464,000 7,519,000 7,571,000 7,643,000
Ordinary Shares Number 7,464,000 7,519,000 7,571,000 7,643,000
and here's the final code:
# for scraping the balance sheet from Yahoo Finance
from time import sleep
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
        By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()
    # content whole page in html format
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)
    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    row = {}
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        head = div.find('span').get_text()
        i = 4
        for h in div.find_all('span'):
            if h.get_text().replace(',', '').isdigit() or h.get_text()[0] == '-':
                row[head].append(h.get_text())
                i += 1
            else:
                while i < 4:
                    row[head].append('')
                    i += 1
                else:
                    head = h.get_text()
                    row[head] = []
                    i = 0
    for k, v in row.items():
        while len(v) < 4:
            row[k].append('-')
    df = pd.DataFrame(columns=col, index=row.keys(), data=row.values())
    print(df)
    return df
get_balance_sheet_from_yfinance("MSFT")
I've removed some of the unused code and added a new scraping method, but I have kept your method of getting the dates of all the columns.
If you have any questions, don't hesitate to ask in the comments.
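One optional follow-up, not part of the answer itself: the returned DataFrame holds comma-formatted strings and '-' placeholders, so a small post-processing step can make it numeric. A sketch, assuming df is the frame returned by get_balance_sheet_from_yfinance():

import pandas as pd

df = get_balance_sheet_from_yfinance("MSFT")
# Strip the thousands separators; '-' and empty placeholders become NaN.
df_numeric = df.apply(
    lambda c: pd.to_numeric(c.str.replace(',', '', regex=False), errors='coerce'))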
It's my first time posting here, so please let me know if I've messed anything up. I'm having some trouble with a nested loop in Selenium. I'm trying to iterate through a list of players, gather stats for each one, and add them to a dataframe. Right now each player in the list gets entered into the search bar and their page is displayed, but stats are only collected for the last player in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)
dataframe1 = []
with open('A.txt') as f:
    players = f.readlines()

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text

data = {
    'Player': player,
    'Completions': comps,
}
dataframe1.append(data)

df = pd.DataFrame(dataframe1)
print(df)
driver.close()
The indentation is wrong for the lines where you initialize the data dict and append to dataframe1; they must be at the same level as the body of the innermost for loop.
I modified your code, here:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
url = "https://www.sports-reference.com/cfb/"
driver = webdriver.Chrome(path)
driver.get(url)
dataframe1 = []
with open('A.txt') as f:
    players = f.readlines()

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player)
    button = driver.find_element(By.XPATH, '//*[@id="header"]/div[3]/form/input[1]')
    button.click()
    stats = driver.find_elements(By.XPATH, '//*[@id="passing"]/tfoot/tr')
    for stat in stats:
        comps = stat.find_element(By.XPATH, '//*[@id="passing"]/tfoot/tr/td[6]').text
        data = {
            'Player': player,
            'Completions': comps,
        }
        dataframe1.append(data)

df = pd.DataFrame(dataframe1)
print(df)
driver.close()
Thank you everyone for your assistance with this issue. I eventually found out that I did not need the 'button' variable or button.click() in the script. send_keys was already hitting Return after the string was passed to the search box (likely because readlines() keeps the trailing newline on each name, which acts as an Enter keypress), so Return was effectively getting hit twice: once on the player's name, and once on an empty search. The default page returned for the empty search did not contain the element I was attempting to find, which resulted in an empty list. Again, thank you for your help with this issue.
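As an illustration of that fix (a sketch, not the poster's exact final script): strip the newline that readlines() leaves on each name and submit the search explicitly, with no button click:

from selenium.webdriver.common.keys import Keys

for player in players:
    search = driver.find_element(By.NAME, "search")
    search.send_keys(player.strip())   # drop the trailing newline that readlines() keeps
    search.send_keys(Keys.RETURN)      # submit the search once; no button click needed
    # ...gather the stats for this player as in the loop above...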
I am trying to iterate through player seasons on NBA.com and pull shooting statistics after each click of the season dropdown menu. After each click, I get the error message "list index out of range" for:
headers = table[1].findAll('th')
It seems to me that the page doesn't load all the way before the source data is saved.
Looking at other similar questions, I have tried using a browser.implicitly_wait() for each loop, but I am still getting the same error. It also doesn't seem that the browser waits for more than the first iteration of the loop.
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd

player_id = str(1629216)
url = 'https://www.nba.com/stats/player/' + player_id + "/shooting/"
browser = Chrome(executable_path='/usr/local/bin/chromedriver')
browser.get(url)
select = Select(browser.find_element_by_xpath('/html/body/main/div/div/div/div[4]/div/div/div/div/div[1]/div[1]/div/div/label/select'))
options = select.options

for index in range(0, len(options)):
    select.select_by_index(index)
    browser.implicitly_wait(5)
    src = browser.page_source
    parser = BeautifulSoup(src, "lxml")
    table = parser.findAll("div", attrs={"class": "nba-stat-table__overflow"})
    headers = table[1].findAll('th')
    headerlist = [h.text.strip() for h in headers[1:]]
    headerlist = [a for a in headerlist if not '\n' in a]
    headerlist.append('AST%')
    headerlist.append('UAST%')
    row_labels = table[1].findAll("td", {"class": "first"})
    row_labels_list = [r.text.strip() for r in row_labels[0:]]
    rows = table[1].findAll('tr')[1:]
    player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
    df = pd.DataFrame(data=player_stats, columns=headerlist, index=row_labels_list)
    print(df)
I found my own answer. I used time.sleep(1) at the top of the loop to give the browser a second to load all the way. Without this delay, the page's source code did not have the appropriate table that I am scraping.
Responding to those who answered: I did not want to go the API route, but I have seen people scrape nba.com using that method. table[1] is the correct table; I just needed to give the source code a chance to load after I loop through the season dropdown:
select.select_by_index(index)
time.sleep(1)
src = browser.page_source
parser = BeautifulSoup(src, "lxml")
table = parser.findAll("div", attrs = {"class":"nba-stat-table__overflow"})
headers = table[1].findAll('th')
headerlist = [h.text.strip() for h in headers[1:]]
headerlist = [a for a in headerlist if not '\n' in a]
headerlist.append('AST%')
headerlist.append('UAST%')
row_labels = table[1].findAll("td", {"class": "first"})
row_labels_list = [r.text.strip() for r in row_labels[0:]]
rows = table[1].findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
df = pd.DataFrame(data=player_stats, columns=headerlist, index = row_labels_list)
print(df)
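An alternative to the fixed time.sleep(1), not from the original post, is an explicit wait. A sketch, assuming the table container keeps the class nba-stat-table__overflow; note that a presence check can pass while the old table is still in the DOM, in which case waiting for the previous table element to go stale is more robust:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

select.select_by_index(index)
# Wait up to 10 seconds for at least one stats table before grabbing page_source.
WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "nba-stat-table__overflow")))
src = browser.page_source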
I am analyzing the balance sheet of Amazon on Yahoo Finance. It contains nested rows, and I cannot extract all of them. The sheet looks like this:
I used BeautifulSoup4 and the Selenium web driver to get me the following output:
The following is the code:
import pandas as pd
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import string
import time
# chart display specifications w/ Panda
pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.width', None)
is_link = 'https://finance.yahoo.com/quote/AMZN/balance-sheet/'
chrome_path = r"C:\\Users\\hecto\\Documents\\python\\drivers\\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get(is_link)
html = driver.execute_script('return document.body.innerHTML;')
soup = BeautifulSoup(html,'lxml')
features = soup.find_all('div', class_='D(tbr)')
headers = []
temp_list = []
label_list = []
final = []
index = 0
#create headers
for item in features[0].find_all('div', class_='D(ib)'):
    headers.append(item.text)

#statement contents
while index <= len(features)-1:
    #filter for each line of the statement
    temp = features[index].find_all('div', class_='D(tbc)')
    for line in temp:
        #each item adding to a temporary list
        temp_list.append(line.text)
    #temp_list added to final list
    final.append(temp_list)
    #clear temp_list
    temp_list = []
    index += 1
df = pd.DataFrame(final[1:])
df.columns = headers
#function to make all values numerical
def convert_to_numeric(column):
    first_col = [i.replace(',','') for i in column]
    second_col = [i.replace('-','') for i in first_col]
    final_col = pd.to_numeric(second_col)
    return final_col

for column in headers[1:]:
    df[column] = convert_to_numeric(df[column])
final_df = df.fillna('-')
print(df)
Again, I cannot seem to get all the rows of the balance sheet in my output (e.g. Cash, Total Current Assets). Where did I go wrong? Am I missing something?
You may have to click the "Expand All" button to see the additional rows. Refer to this thread to see how to simulate the click in Selenium: python selenium click on button
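For example, reusing the "Expand All" pattern from the MSFT thread above (a sketch; the data-test selector may change whenever Yahoo updates its markup):

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(is_link)
# Click "Expand All" so the nested rows are rendered before grabbing the HTML.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((
    By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()
html = driver.execute_script('return document.body.innerHTML;')
soup = BeautifulSoup(html, 'lxml')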