Web Scraping Using Selenium to get date-based data - Python

I'm very new to web scraping, and I really do mean new.
I need to scrape data from a table on a website. The table changes every day (stock prices). So far my code extracts the data for a single day, but I need it to do this for multiple days at once. The web page has a calendar: you can choose a day and it shows you that day's history.
I'm using Selenium.
Here's part of my code to show you what I'm doing:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_path = r"C:\Program Files (x86)\chromedriver.exe"
chrome_options = Options()
chrome_options.add_argument("headless")
driver = webdriver.Chrome(chrome_path, options=chrome_options, keep_alive=False)
driver.get("http://www.casablanca-bourse.com/bourseweb/indice-ponderation.aspx?Cat=22&IdLink=298")
codelist = []
instrumentList = []
NbreList = []
CoursList = []
FacteurList = []
FacteurPlafList = []
Capitalist = []
poidList = []
for i in range(4, 77):
    codepath = f"""//*[@id="Ponderation1_UpdatePanel1"]/table/tbody/tr[5]/td/table/tbody/tr[4]/td[2]"""
    code = driver.find_element_by_xpath(codepath)
    codelist.append(code.text)

Change the date and click the button:
driver.find_element_by_id("Ponderation1_DateTimeControl1_TBCalendar").clear()
driver.find_element_by_id("Ponderation1_DateTimeControl1_TBCalendar").send_keys("19/08/2020")
driver.find_element_by_id("Ponderation1_ImageButton1").click()
for i in range(4, 77):
    codepath = f"""//*[@id="Ponderation1_UpdatePanel1"]/table/tbody/tr[5]/td/table/tbody/tr[{i}]/td[2]"""
    code = driver.find_element_by_xpath(codepath)
    codelist.append(code.text)
print(codelist)
(Also put {i} for the tr index in the first loop.)

Since you have defined one list for each column, I assume you want to store the data of each column in a separate list, and that you want to load the table based on the date. You can define the function below and then call it to get the data for each column.
import time

def scraping_table(date, columnNumber):
    colList = []
    colXpath = "//tr[td[text()='Code Isin']]//following-sibling::tr//td[" + str(columnNumber) + "]"
    # Enter date in the date picker
    datePicker = driver.find_element_by_name("Ponderation1$DateTimeControl1$TBCalendar")
    datePicker.clear()
    datePicker.send_keys(date)
    driver.find_element_by_name("Ponderation1$ImageButton1").click()
    time.sleep(6)  # Wait for the table to load
    data = driver.find_elements_by_xpath(colXpath)
    if len(data) > 2:  # If the table is empty for a date, no records are returned
        for i in range(2, len(data) - 1):
            colList.append(data[i].text)
    return colList

chrome_path = r'..\drivers\chromedriver'
chrome_options = Options()
chrome_options.add_argument("headless")
driver = webdriver.Chrome(chrome_path, options=chrome_options, keep_alive=False)
driver.get("http://www.casablanca-bourse.com/bourseweb/indice-ponderation.aspx?Cat=22&IdLink=298")
# Call the function now; pass date and column as per your need
codelist = scraping_table('17/08/2020', 2) # Note your table has hidden columns and Code Isin is column number 2
instrumentList = scraping_table('17/08/2020', 3)
NbreList = scraping_table('17/08/2020', 4)
CoursList = scraping_table('17/08/2020', 5)
FacteurList = scraping_table('17/08/2020', 6)
FacteurPlafList = scraping_table('17/08/2020', 7)
Capitalist = scraping_table('17/08/2020', 8)
poidList = scraping_table('17/08/2020', 9)
# To illustrate, print the values of the 'Nombre de titres' column
for num in NbreList:
    print(num)
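
Once scraping_table works for one date, extending it to several days is mostly a matter of looping over a list of dates and collecting the per-column lists. Here is a minimal sketch of that idea, reusing the driver and scraping_table defined above; the date list and the column-name-to-number mapping are only illustrative assumptions, not values taken from the site:

import pandas as pd

dates = ['17/08/2020', '18/08/2020', '19/08/2020']  # example dates only
columns = {'Code Isin': 2, 'Instrument': 3, 'Nombre de titres': 4, 'Cours': 5}  # assumed mapping

frames = []
for d in dates:
    # One list per column for this date, using the function from the answer above
    day_data = {name: scraping_table(d, num) for name, num in columns.items()}
    day_df = pd.DataFrame(day_data)
    day_df['Date'] = d
    frames.append(day_df)

all_days = pd.concat(frames, ignore_index=True)
print(all_days.head())

Each call reloads the table, so this is slow but simple; the wait inside scraping_table handles the page refresh between dates.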

How to extract data in the right order with Beautiful Soup

I am trying to extract the balance sheet for an example ticker "MSFT" (Microsoft) from Yahoo Finance.
I'm using Selenium to click on the "Expand All" button before any scraping is done. This part seems to work.
By the way, when the Chrome web driver is launched, I manually click on the button(s) to accept or reject cookies. In a later step, I plan to add some more code so that this part is also automated. My question, though, is not about that part for now.
Below is how the code currently looks.
# for scraping the balance sheet from Yahoo Finance
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
        By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()

    # content of whole page in html format
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)

    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    df = pd.DataFrame(columns=col)

    # the following code returns an empty list for index (why?)
    # and values in a list that need actually be in a DataFrame
    idx = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        for h in div.find_all('title'):
            text = h.get_text()
            idx.append(text)
    val = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-col'}):
        for h in div.find_all('span'):
            num = int(h.get_text().replace(",", "")) * 1000
            val.append(num)

    # if the above part is commented out and this block is used instead
    # the following code manages to work well until the row "Cash Equivalents"
    # that is because there are no entries for years 2020 and 2019 on this row
    """ for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        i = 0
        idx = ""
        val = []
        for h in div.find_all('span'):
            if i % 5 == 0:
                idx = h.get_text()
            else:
                num = int(h.get_text().replace(",", "")) * 1000
                val.append(num)
            i += 1
        row = pd.DataFrame([val], columns=col, index=[idx])
        df = pd.concat([df, row], axis=0) """

    return idx, val


get_balance_sheet_from_yfinance("MSFT")
I could not get the data scraped from the expanded table in a usable tabular format. Instead, the function above returns what I managed to scrape from the webpage. There are some additional comments in the code.
Could you give me some ideas on how to properly extract the data and put it into a DataFrame object whose index is the text under the "Breakdown" column? Basically, the DataFrame should look like the snapshot below, with the entries under the first column serving as the index.
[snapshot: balance-sheet-df]
I've spent a long time on this; hope it helps. Basically, your function now returns a DataFrame with the following formatting:
2022-06-29 2021-06-29 2020-06-29 2019-06-29
Total Assets 364,840,000 333,779,000 301,311,000 286,556,000
Current Assets 169,684,000 184,406,000 181,915,000 175,552,000
Cash, Cash Equivalents & Short Term Investments 104,749,000 130,334,000 136,527,000 133,819,000
Cash And Cash Equivalents 13,931,000 14,224,000 13,576,000 11,356,000
Cash 8,258,000 7,272,000 - -
... ... ... ... ...
Tangible Book Value 87,720,000 84,477,000 67,915,000 52,554,000
Total Debt 61,270,000 67,775,000 70,998,000 78,366,000
Net Debt 35,850,000 43,922,000 49,751,000 60,822,000
Share Issued 7,464,000 7,519,000 7,571,000 7,643,000
Ordinary Shares Number 7,464,000 7,519,000 7,571,000 7,643,000
and here's the final code:
# for scraping the balance sheet from Yahoo Finance
from time import sleep
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
        By.XPATH, "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()

    # content of whole page in html format
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)

    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))

    row = {}
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        head = div.find('span').get_text()
        i = 4
        for h in div.find_all('span'):
            if h.get_text().replace(',', '').isdigit() or h.get_text()[0] == '-':
                row[head].append(h.get_text())
                i += 1
            else:
                while i < 4:
                    row[head].append('')
                    i += 1
                else:
                    head = h.get_text()
                    row[head] = []
                    i = 0

    for k, v in row.items():
        while len(v) < 4:
            row[k].append('-')

    df = pd.DataFrame(columns=col, index=row.keys(), data=row.values())
    print(df)
    return df


get_balance_sheet_from_yfinance("MSFT")
I've removed some of the unused code and added a new scraping method, but I have kept your method of getting the dates of all the columns.
If you have any questions, don't hesitate to ask in the comments.
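
If numeric values are more convenient than the comma-formatted strings (and NaN instead of the '-' placeholders used for missing years), a small post-processing step can be bolted on. This is just a sketch of one way to do it, assuming df is the DataFrame returned by the function above; to_numeric_frame is a hypothetical helper, not part of the original answer:

import numpy as np
import pandas as pd

def to_numeric_frame(df):
    # Replace the '-' and '' placeholders with NaN, strip thousands separators,
    # then convert every column to floats (unparsable cells become NaN)
    cleaned = df.replace({'-': np.nan, '': np.nan})
    return cleaned.apply(lambda s: pd.to_numeric(s.astype(str).str.replace(',', ''), errors='coerce'))

numeric_df = to_numeric_frame(get_balance_sheet_from_yfinance("MSFT"))
print(numeric_df.dtypes)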

Looping and stop duplicating output | Selenium | Python

Very new to Python and Selenium, looking to scrape a few data points. I'm struggling in three areas:
I don't understand how to loop through multiple URLs properly
I can't figure out why the script is iterating twice over each URL
I can't figure out why it's only outputting the data for the second URL
Much thanks for taking a look!
Here's my current script:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

driver = webdriver.Chrome(executable_path='/Library/Frameworks/Python.framework/Versions/3.9/bin/chromedriver')

for url in urls:
    for page in range(0, 1):
        driver.get(url)
        wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
        df = pd.DataFrame(columns=['Title', 'Core Web Vitals', 'FCP', 'FID', 'CLS', 'TTI', 'TBT', 'Total Score'])
        company = driver.find_elements_by_class_name("audited-url__link")
        data = []
        for i in company:
            data.append(i.get_attribute('href'))
        for x in data:
            # Get URL name
            title = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/h1/a')
            co_name = title.text
            # Get Core Web Vitals text pass/fail
            cwv = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[1]/span[2]')
            core_web = cwv.text
            # Get FCP
            fcp = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/div')
            first_content = fcp.text
            # Get FID
            fid = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[1]/div')
            first_input = fid.text
            # Get CLS
            cls = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[4]/div[1]/div')
            layout_shift = cls.text
            # Get TTI
            tti = driver.find_element_by_xpath('//*[@id="interactive"]/div/div[1]')
            time_interactive = tti.text
            # Get TBT
            tbt = driver.find_element_by_xpath('//*[@id="total-blocking-time"]/div/div[1]')
            total_block = tbt.text
            # Get Total Score
            total_score = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]')
            score = total_score.text
            # Adding all columns to dataframe
            df.loc[len(df)] = [co_name, core_web, first_content, first_input, layout_shift, time_interactive, total_block, score]

driver.close()
#df.to_csv('Double Page Speed Test 9-10.csv')
print(df)
Q1: I don't understand how to loop through multiple URLs properly?
Ans: for url in urls: — that is how you loop through them.
Q2: I can't figure out why the script is iterating twice over each URL.
Ans: Because you have for page in range(0, 1):
Update 1:
I did not run your entire code with the DataFrame. Also, sometimes one of the pages does not show the number and href, but when I run the code below:
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(50)
wait = WebDriverWait(driver, 20)

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

data = []
for url in urls:
    driver.get(url)
    wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    company = driver.find_elements_by_css_selector("h1.audited-url a")
    for i in company:
        data.append(i.get_attribute('href'))
print(data)
this is the output:
['https://www.crutchfield.com//', 'https://www.lastpass.com/', 'https://www.lastpass.com/']
which is expected, because the element locator that we have used matches 1 element on page 1 and 2 elements on page 2.
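
To the third point (only the last URL's data being output): the DataFrame is recreated inside the loop, so each pass throws away the rows from the previous URL. One way around that is sketched below, assuming the urls list and driver from the snippet above are already set up; only the title and total-score lookups are shown, and the remaining metrics can be added the same way with the XPaths from the question:

import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Create the DataFrame once, before the loop, so rows accumulate across URLs
df = pd.DataFrame(columns=['Title', 'Total Score'])

for url in urls:
    driver.get(url)
    WebDriverWait(driver, 120).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    co_name = driver.find_element_by_css_selector("h1.audited-url a").text
    score = driver.find_element_by_xpath(
        '//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]').text
    df.loc[len(df)] = [co_name, score]

driver.quit()
print(df)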

Open Webpage with Selenium and Check If Variable Is in Body

I am using the Selenium webdriver to open the URLs in the for loop. Once a URL opens, it stores get_file_total in a list via file_total_l.append(get_file_total). How can I check that the variable 'missing_amount' is present on the web page before appending get_file_total to the list file_total_l?
What's happening:
It's inserting file_total into my table in the database a second time if I run the script twice. The missing_amount is 165,757.06 from my table, so why isn't that being inserted?
print(icl_dollar_amount):
['627,418.07', '6,986,500.57', '165,757.06']
print(missing_amount[i])
'165,757.06'
code:
missing_amount = []
for rps_amount2 in rps_amount_l:
if rps_amount2 not in bpo_file_total_l:
rps_table_q_2 = f"""select * from rps..sendfile where processingdate = '{cd}' and datasetname like '%ICL%' and paymenttotamt = '{rps_amount2}' """
rps_table_results = sql_server_cursor.execute(rps_table_q_2).fetchall()
file_missing = True
for rps in rps_table_results:
rps_amount_f = str(rps[18]).rstrip('0')
rps_amount_f = ("{:,}".format(float(rps_amount_f)))
missing_amount.append(rps_amount_f)
file_total_l
for link in url_list:
print(link)
options = Options()
browser = webdriver.Chrome(options=options,
executable_path=r'\\test\user$\test\Documents\driver\chromedriver.exe')
browser.get(link)
body = browser.find_element_by_xpath("//*[contains(text(), 'Total:')]").text
body_l.append(body)
icl_dollar_amount = re.findall('(?:[\£\$\€]{1}[,\d]+.?\d*)', body)[0].split('$', 1)[1]
icl_dollar_amount_l.append(icl_dollar_amount)
if not missing_amount:
logging.info("List is empty")
print("List is empty")
count = 0
for i in range(len(missing_amount)):
if missing_amount[i] in icl_dollar_amount_l:
body = body_l[i]
get_file_total = re.findall('(?:[\£\$\€]{1}[,\d]+.?\d*)', body)[0].split('$', 1)[1]
file_total_l.append(get_file_total)
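
One straightforward way to do the check the question asks for (only append get_file_total when one of the missing amounts actually appears on the page) is a membership test before the append. This is only a sketch, reusing url_list, missing_amount and file_total_l from the code above:

import re
from selenium import webdriver

browser = webdriver.Chrome()
for link in url_list:
    browser.get(link)
    body = browser.find_element_by_xpath("//*[contains(text(), 'Total:')]").text
    # Only keep this page's total if one of the missing amounts appears in its text
    if any(amount in body for amount in missing_amount):
        get_file_total = re.findall('(?:[\£\$\€]{1}[,\d]+.?\d*)', body)[0].split('$', 1)[1]
        file_total_l.append(get_file_total)
    else:
        print(f"No missing amount found on {link}, skipping")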

Python BeautifulSoup4 Parsing: Hidden html elements on Yahoo Finance

I am analyzing the balance sheet of Amazon on Yahoo Finance. It contains nested rows, and I cannot extract all of them. The sheet looks like this:
I used BeautifulSoup4 and the Selenium web driver to get me the following output:
The following is the code:
import pandas as pd
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import string
import time

# chart display specifications w/ Pandas
pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.width', None)

is_link = 'https://finance.yahoo.com/quote/AMZN/balance-sheet/'

chrome_path = r"C:\\Users\\hecto\\Documents\\python\\drivers\\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get(is_link)

html = driver.execute_script('return document.body.innerHTML;')
soup = BeautifulSoup(html, 'lxml')
features = soup.find_all('div', class_='D(tbr)')

headers = []
temp_list = []
label_list = []
final = []
index = 0

# create headers
for item in features[0].find_all('div', class_='D(ib)'):
    headers.append(item.text)

# statement contents
while index <= len(features) - 1:
    # filter for each line of the statement
    temp = features[index].find_all('div', class_='D(tbc)')
    for line in temp:
        # each item adding to a temporary list
        temp_list.append(line.text)
    # temp_list added to final list
    final.append(temp_list)
    # clear temp_list
    temp_list = []
    index += 1

df = pd.DataFrame(final[1:])
df.columns = headers

# function to make all values numerical
def convert_to_numeric(column):
    first_col = [i.replace(',', '') for i in column]
    second_col = [i.replace('-', '') for i in first_col]
    final_col = pd.to_numeric(second_col)
    return final_col

for column in headers[1:]:
    df[column] = convert_to_numeric(df[column])

final_df = df.fillna('-')
print(df)
Again, I cannot seem to get all the rows of the balance sheet in my output (e.g. Cash, Total Current Assets). Where did I go wrong? Am I missing something?
You may have to click the "Expand All" button to see the additional rows. Refer to this thread to see how to simulate the click in Selenium: python selenium click on button
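As a rough sketch of that suggestion, reusing chrome_path and is_link from the question and assuming the button is still reachable by its "Expand All" text (Yahoo's markup changes often), the click can be simulated before grabbing the page HTML:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

driver = webdriver.Chrome(chrome_path)
driver.get(is_link)

# Click "Expand All" so the nested rows are present in the DOM before parsing
WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.XPATH, "//span[text()='Expand All']"))
).click()

html = driver.execute_script('return document.body.innerHTML;')
soup = BeautifulSoup(html, 'lxml')
features = soup.find_all('div', class_='D(tbr)')
# ... continue with the header/row extraction from the question ...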

Appending Scraped Data to Dataframe - Python, Selenium

I'm learning web scraping and working on Eat24 (Yelp's website). I'm able to scrape basic data from Yelp, but unable to do something pretty simple: append that data to a DataFrame. Here is my code; I've annotated it so it should be simple to follow along.
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

# go to eat24, type in zip code 10007, choose pickup and click search
driver.get("https://new-york.eat24hours.com/restaurants/index.php")
search_area = driver.find_element_by_name("address_auto_complete")
search_area.send_keys("10007")
pickup_element = driver.find_element_by_xpath("//*[@id='search_form']/div/table/tbody/tr/td[2]")
pickup_element.click()
search_button = driver.find_element_by_xpath("//*[@id='search_form']/div/table/tbody/tr/td[3]/button")
search_button.click()

# scroll up and down on page to load more of the 'infinity' list
for i in range(0, 3):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.execute_script("window.scrollTo(0,0);")
    time.sleep(1)

# find menu urls
menu_urls = [page.get_attribute('href') for page in
             driver.find_elements_by_xpath('//*[@title="View Menu"]')]

df = pd.DataFrame(columns=['name', 'menuitems'])

# collect menu items/prices/name from each URL
for url in menu_urls:
    driver.get(url)
    menu_items = driver.find_elements_by_class_name("cpa")
    menu_items = [x.text for x in menu_items]
    menu_prices = driver.find_elements_by_class_name('item_price')
    menu_prices = [x.text for x in menu_prices]
    name = driver.find_element_by_id('restaurant_name')
    menuitems = dict(zip(menu_items, menu_prices))
    df['name'] = name
    df['menuitems'] = menuitems

df.to_csv('test.csv', index=False)
The problem is at the end. It isn't adding menuitems + name into successive rows in the dataframe. I have tried using .loc and other functions but it got messy so I removed my attempts. Any help would be appreciated!!
Edit: The error I get is "ValueError: Length of values does not match length of index" when the for loop attempts to add a second set of menuitems/restaurant name to the dataframe
I figured out a simple solution; not sure why I didn't think of it before. I added a "row" count that goes up by 1 on each iteration, and used .loc to place data in the "row"-th row.
row = 0
for url in menu_urls:
    row += 1
    driver.get(url)
    menu_items = driver.find_elements_by_class_name("cpa")
    menu_items = [x.text for x in menu_items]
    menu_prices = driver.find_elements_by_class_name('item_price')
    menu_prices = [x.text for x in menu_prices]
    name = driver.find_element_by_id('restaurant_name').text
    menuitems = [dict(zip(menu_items, menu_prices))]
    df.loc[row, 'name'] = name
    df.loc[row, 'menuitems'] = menuitems
print(df)
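
For what it's worth, the original "Length of values does not match length of index" error most likely comes from assigning whole columns at once (df['name'] = ..., df['menuitems'] = ...) with values whose length doesn't match the DataFrame's index. An alternative to per-row .loc writes, sketched here under the assumption that driver and menu_urls from the question are already set up, is to collect one record per restaurant and build the DataFrame in one go at the end:

import pandas as pd

records = []
for url in menu_urls:
    driver.get(url)
    menu_items = [x.text for x in driver.find_elements_by_class_name("cpa")]
    menu_prices = [x.text for x in driver.find_elements_by_class_name('item_price')]
    # One dict per restaurant: its name plus an item -> price mapping
    records.append({
        'name': driver.find_element_by_id('restaurant_name').text,
        'menuitems': dict(zip(menu_items, menu_prices)),
    })

df = pd.DataFrame(records, columns=['name', 'menuitems'])
df.to_csv('test.csv', index=False)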
