I'm trying to pull historical data from a URL. The date (as epoch time) is part of the URL.
import pandas as pd
import numpy as np
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import time
from datetime import datetime
options = Options()
options.headless = True
lastDate = '2021-07-01'
firstDate = '2010-01-01'
time_object = time.strptime(lastDate, '%Y-%m-%d')
period2 = int(time.mktime(time_object))
period1 = int(period2 - 86400*200)
time_object = time.strptime(firstDate, '%Y-%m-%d')
period0 = time.mktime(time_object)
count = 1
url=f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
#url=r'https://finance.yahoo.com/quote/%5EGSPC/history?period1=1262304000&period2=1625097600&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true'
while period2 >= period0:
    ed = datetime.fromtimestamp(period2)
    sd = datetime.fromtimestamp(period1)
    print(f"Working on {sd} {ed}, current count {count}")
    print(f"URL is {url}")
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(20)
    driver.get(url)
    js = "var q=document.documentElement.scrollTop=100000"
    driver.execute_script(js)
    # 't' was undefined in the original snippet; presumably the rendered page
    # was parsed and the history table located, e.g.:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    t = soup.find('table')
    for row in t.tbody.findAll('tr'):
        date = row.findAll('td')[0].text
        date = datetime.strptime(date, "%b %d, %Y")
        date = date.strftime("%Y-%m-%d")
        open = row.findAll('td')[1].text.replace(',','')
        high = row.findAll('td')[2].text.replace(',','')
        low = row.findAll('td')[3].text.replace(',','')
        close = row.findAll('td')[4].text.replace(',','')
        adjclose = row.findAll('td')[5].text.replace(',','')
        volume = row.findAll('td')[6].text.replace(',','')
        hist = pd.DataFrame([[date,open,high,low,close,adjclose,volume]], columns=['Date', 'Open','High','Low','Close', 'Adj Close', 'Volume'])
        if count == 1:
            hist.to_csv('hist.csv', index=False, header=True)
        else:
            hist.to_csv('hist.csv', index=False, mode='a', header=False)
        count = count + 1
    period2 = int(period1)
    period1 = int(period2 - 86400*200)
    url = f"https://finance.yahoo.com/quote/%5EGSPC/history?period1={period1}&period2={period2}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
    driver.close()
I printed the URL, and it updated properly with the newly refreshed period. However, what's being written to my hist.csv is duplicated. It seems the driver only respected my first URL and completely ignored the rest; as a result, I got the first period of dates/prices repeated in my hist.csv.
Appreciate any comments.
Thanks
Please disregard - I just realized I didn't refresh the variable while using Jupyter. I found the problem just two minutes after I posted the question. Thanks for the great Stack Overflow!
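For anyone who hits the same symptom in a notebook, a minimal sketch (not the exact fix here, which was simply re-running the cell) that makes stale state harder to hit is to rebuild the URL in a small helper, so every request recomputes it from the current periods:
def build_url(period1, period2):
    # Recompute the history URL from the current periods on every call,
    # so a stale module-level 'url' variable cannot be reused by mistake.
    return ("https://finance.yahoo.com/quote/%5EGSPC/history"
            f"?period1={period1}&period2={period2}"
            "&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true")
url = build_url(period1, period2)  # call again after updating the periods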
Related
I'm trying to scrape the next day's forecast for time, wind speed and wind direction from Weather Underground. I adapted the code in this tutorial and my MWE is
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
# define future date
start_date = date.today() + pd.Timedelta(days=1)
# get data for Sydney
page = 'https://www.wunderground.com/hourly/au/sydney/date/{}-{}-{}'
df = pd.DataFrame()
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.set_capability("loggingPrefs", {'performance': 'ALL'})
service = ChromeService(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
classlist = ["mat-cell cdk-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted",
"mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",
"mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",]
name = ['time', 'windspeed_mph', 'winddirection']
print('gathering data from: ', start_date)
formatted_lookup_URL = page.format(start_date.year,
                                   start_date.month,
                                   start_date.day)
driver.get(formatted_lookup_URL)
rows = WebDriverWait(driver, 20).until( \
    EC.visibility_of_all_elements_located((By.XPATH, \
    '//td[@class="' + classlist[0] + '"]')))
for row in rows:
    time = row.find_element(By.XPATH, './/span[@class="ng-star-inserted"]').text
    # append new row to table
    df = df.append(pd.DataFrame({"Day": [str(start_date)], "time": [time]}),
                   ignore_index=True)
del classlist[0]
for ii in range(len(classlist)):
    rows = WebDriverWait(driver, 20).until( \
        EC.visibility_of_all_elements_located((By.XPATH, \
        '//td[@class="' + classlist[ii] + '"]')))
    for row in rows:
        if name[ii] == 'winddirection':
            data = row.find_element(By.XPATH,
                './/span[@class="wu-suffix ng-star-inserted"]').text
            print(data)
        else:
            data = row.find_element(By.XPATH,
                './/span[@class="wu-value wu-value-to"]').text
        # append new row to table
        df = df.append(pd.DataFrame({name[ii]: [data]}), ignore_index=True)
driver.quit()
# remove NaN
df = df.apply(lambda x: pd.Series(x.dropna().values))
print(df.head())
The final dataframe df contains the time and wind speed, but not the wind direction. I suspect it's because of the line data = row.find_element(By.XPATH, './/span[@class="wu-suffix ng-star-inserted"]').text, but I'm not sure how to fix it.
It seems that classlist and name have different lengths after the line del classlist[0]. Fix it by adding this line right after deleting the first element of classlist:
del name[0]
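Alternatively, a minimal sketch of the same fix that avoids mutating either list: slice both past their first element and iterate over the pairs with zip, so the lengths can never drift apart:
for cls, col in zip(classlist[1:], name[1:]):
    # cls is the td class to wait for, col the output column name;
    # the extraction body stays the same as in the loop above.
    rows = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, '//td[@class="' + cls + '"]')))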
This is the code that I am working on, and I want the code to iterate through the scraping and the dates in such a way that it does not overwrite the CSV file, with the new data added after scraping the next set of dates. Please help me with it.
I tried different methods such as f.write() and append for the same code, but each gives me some error or other.
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from functools import reduce
import pandas as pd
import time
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
import requests
def render_page(url, type):
    driver = webdriver.Chrome(executable_path=r'C:\Users\abc\Desktop\screen scraping codes\chromedriver.exe')
    driver.get(url)
    time.sleep(15)
    if type == "C":
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'wuSettings'))
        )
        element.click()
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="wuSettings-quick"]/div/a[2]')))
        element.click()
        time.sleep(15)
        r = driver.page_source
        driver.quit()
    if type == "F":
        r = driver.page_source
        driver.quit()
    return r

def hourly_scraper(page, dates, type):
    output = pd.DataFrame()
    for d in dates:
        url = str(str(page) + str(d))
        r = render_page(url, type)
        soup = BS(r, "html.parser")
        container = soup.find('lib-city-history-observation')
        check = container.find('tbody')
        data = []
        data_hour = []
        for i in check.find_all('span', class_='ng-star-inserted'):
            trial = i.get_text()
            data_hour.append(trial)
        for i in check.find_all('span', class_='wu-value wu-value-to'):
            trial = i.get_text()
            data.append(trial)
        numbers = pd.DataFrame([data[i:i+7] for i in range(0, len(data), 7)], columns=["Temperature","Dew Point","Humidity","Wind Speed","Wind Gust","Pressure","Precipitation"])
        hour = pd.DataFrame(data_hour[0::17], columns=["Time"])
        wind = pd.DataFrame(data_hour[7::17], columns=["Wind"])
        condition = pd.DataFrame(data_hour[16::17], columns=["Condition"])
        dfs = [hour, numbers, wind, condition]
        df_final = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), dfs)
        df_final['Date'] = str(d)
        output = output.append(df_final)
        df_final.to_csv(r'C:\Users\abc\Desktop\screen scraping codes\123.csv', index=False)
        print(str(str(d) + ' finished!'))
    return output
page = "https://www.wunderground.com/history/daily/in/ahmedabad/VAAH/date/"
#dates = ["2020-12-27","2020-12-28"]
d0 = datetime(2009, 1,1)
d1 = datetime(2009, 1,3)
dt = timedelta(days = 1)
dates = np.arange(d0, d1, dt).astype(datetime)
hourly = hourly_scraper(page,dates,"C")
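Since the question is specifically about not overwriting the CSV, here is a minimal sketch of one common approach (an assumption about the intended behavior, not tested against the site): open the file in append mode and write the header only when the file does not exist yet:
import os
def append_to_csv(df_final, path=r'C:\Users\abc\Desktop\screen scraping codes\123.csv'):
    # Append rows to the CSV; write the header only on the first write,
    # so one call per scraped date accumulates rows instead of overwriting.
    df_final.to_csv(path, mode='a', index=False, header=not os.path.exists(path))
Calling append_to_csv(df_final) in place of the df_final.to_csv(...) line inside the loop should then add each day's rows after the previous ones.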
I have an Excel file like this.
I am scraping new data from the web for the D and E columns using this code.
import csv
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
urls =['https://www.linkedin.com/in/felipe-fs',
'https://www.linkedin.com/in/lucascacao',
'https://www.linkedin.com/in/silvia-florido-107a2355',
'https://www.linkedin.com/in/alesillva',
'https://www.linkedin.com/in/marcellogpassos',
'https://www.linkedin.com/in/ana-luiza-fidelis-de-sousa',
'https://www.linkedin.com/in/thiagoanjos',
'https://www.linkedin.com/in/eduardoneves',
'https://www.linkedin.com/in/gabriel-de-santana-weizenmann-73aab7116',
'https://www.linkedin.com/in/felipebluiz']
header_added = False
timestr = time.strftime("%Y%m%d-%H%M%S")
chrome_options = Options()
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe', options=chrome_options)
driver.maximize_window()
for url in urls:
    driver.get(url)
    try:
        n = False
        company = driver.find_element_by_xpath('//ul[@class="pv-top-card--experience-list"]')
        if not n:
            res = company.text
            n = True
        html = driver.find_element_by_tag_name('html')
        for i in range(2):
            html.send_keys(Keys.PAGE_DOWN)
            time.sleep(3)
        experience = driver.find_elements_by_tag_name('h4')
        duration = experience[0].text
        with open('test.xlxs', 'a', encoding='utf-8-sig') as f:
            w = csv.writer(f)
            w.writerow(['', '', '', res, duration])
    except:
        print("User has an older job")
The data in D and E is only there for 4-5 rows, so it can be overwritten. How do I update these columns with the newly scraped data? I saw many answers but am not sure how to make a data frame of the scraped data. I need to read the URLs from the same file, scrape the data, and add it to the existing columns. The URL list in the code is just a test.
EDIT: I looked around and decided it's more convenient to simply convert the xlsx to a csv, so I installed xlrd version 1.2 and did that.
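For reference, a minimal sketch of that conversion (assuming pandas can open the workbook with xlrd 1.2 installed; 'oldfile.xlsx' is the filename used in the answer below):
import pandas as pd
# Read the workbook once and write it back out as plain CSV,
# so later runs can append rows without dealing with Excel formats.
pd.read_excel('oldfile.xlsx').to_csv('oldfile.csv', index=False)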
Input: the url string to match and the new data for the D and E columns.
Output: the matching row is updated.
from openpyxl import load_workbook, Workbook
def update(url, dataDcol, dataEcol):
    try:
        # find the row whose column C matches the url, then update D and E
        wk = load_workbook('oldfile.xlsx')
        wh = wk.active
        for row in wh['C']:
            if row.value == url:
                wh['D{}'.format(row.row)] = dataDcol
                wh['E{}'.format(row.row)] = dataEcol
                break
        wk.save('oldfile.xlsx')
        wk.close()
    except Exception as e:
        print('error :' + str(e))
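A hypothetical usage, assuming oldfile.xlsx holds the profile URLs in column C and res/duration are the values scraped above:
update('https://www.linkedin.com/in/felipe-fs', res, duration)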
I'm very much new to web scraping, and I really do mean new.
I need to scrape data from a table on a website. That table changes every day (stock prices). Until now my code extracts the data for one single day, but I need it to do so for multiple days at once. The web page has a calendar: you can choose a day and it shows you its history.
I'm using Selenium.
Here's part of my code to show you what I'm doing:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_path = "C:\Program Files (x86)\chromedriver.exe"
chrome_options = Options()
chrome_options.add_argument("headless")
driver = webdriver.Chrome(chrome_path , options = chrome_options , keep_alive = False)
driver.get("http://www.casablanca-bourse.com/bourseweb/indice-ponderation.aspx?Cat=22&IdLink=298")
codelist = []
instrumentList = []
NbreList = []
CoursList = []
FacteurList = []
FacteurPlafList = []
Capitalist = []
poidList = []
for i in range(4,77):
    codepath = f"""//*[@id="Ponderation1_UpdatePanel1"]/table/tbody/tr[5]/td/table/tbody/tr[4]/td[2]"""
    code = driver.find_element_by_xpath(codepath)
    codelist.append(code.text)
Then I change the date and click the button:
driver.find_element_by_id("Ponderation1_DateTimeControl1_TBCalendar").clear()
driver.find_element_by_id("Ponderation1_DateTimeControl1_TBCalendar").send_keys("19/08/2020")
driver.find_element_by_id("Ponderation1_ImageButton1").click()
for i in range(4,77):
    codepath = f"""//*[@id="Ponderation1_UpdatePanel1"]/table/tbody/tr[5]/td/table/tbody/tr[{i}]/td[2]"""
    code = driver.find_element_by_xpath(codepath)
    codelist.append(code.text)
print(codelist)
Also put {i} in place of the fixed tr index in your first loop.
Since you have defined one list for each column, I assume you want to store the data of each column in a separate list and load the table based on a date. You can define the function below and then call it to get the data for each column.
import time
def scraping_table(date, columnNumber):
    colList = []
    colXpath = "//tr[td[text()='Code Isin']]//following-sibling::tr//td[" + str(columnNumber) + "]"
    # Enter date in date picker
    datePicker = driver.find_element_by_name("Ponderation1$DateTimeControl1$TBCalendar")
    datePicker.clear()
    datePicker.send_keys(date)
    driver.find_element_by_name("Ponderation1$ImageButton1").click()
    time.sleep(6)  # Wait for the table to load
    data = driver.find_elements_by_xpath(colXpath)
    if len(data) > 2:  # If the table is empty for a date, no records will be returned
        for i in range(2, len(data) - 1):
            colList.append(data[i].text)
    return colList
chrome_path = '..\drivers\chromedriver'
chrome_options = Options()
chrome_options.add_argument("headless")
driver = webdriver.Chrome(chrome_path , options = chrome_options , keep_alive = False)
driver.get("http://www.casablanca-bourse.com/bourseweb/indice-ponderation.aspx?Cat=22&IdLink=298")
# Call the function now; pass the date and column number as per your need
codelist = scraping_table('17/08/2020', 2) # Note your table has hidden columns and Code Isin is column number 2
instrumentList = scraping_table('17/08/2020', 3)
NbreList = scraping_table('17/08/2020', 4)
CoursList = scraping_table('17/08/2020', 5)
FacteurList = scraping_table('17/08/2020', 6)
FacteurPlafList = scraping_table('17/08/2020', 7)
Capitalist = scraping_table('17/08/2020', 8)
poidList = scraping_table('17/08/2020', 9)
# To illustrate, I have printed the values of the 'Nombre de titres' column
for num in NbreList:
    print(num)
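As a design note, the fixed time.sleep(6) inside scraping_table could be swapped for an explicit wait; a sketch of the replacement line (the imports go at the top of the script, and colXpath is already in scope inside the function):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Wait until at least one cell of the requested column is present,
# instead of sleeping a fixed six seconds.
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.XPATH, colXpath)))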
The goal is to use datetime to iterate over
http://www.harness.org.au/racing/results/?firstDate=01-01-2019
http://www.harness.org.au/racing/results/?firstDate=02-01-2019 ... up to yesterday's date
(this should be done in new_url = base_url + str(enddate1)).
Then, once at that href, I want to loop over the meetingListFull table to get each track's name and href, and then get the results data from each track that day.
My current error is '<=' not supported between instances of 'datetime.timedelta' and 'str', which comes from my while loop. Why is this? I've never used datetime before.
from datetime import datetime, date, timedelta
import requests
import re
from bs4 import BeautifulSoup
base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = BeautifulSoup(webpage_response.content, "html.parser")
format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
yesterday1 = yesterday.strftime(format)
enddate = datetime(2019, 1, 1)
enddate1 = enddate.strftime(format)
while enddate1 <= yesterday1:
    enddate1 =+ timedelta(days=1)
    new_url = base_url + str(enddate1)
    soup12 = requests.get(new_url)
    soup1 = BeautifulSoup(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    for tr in table1.find_all('tr'):
        all_cells = tr.find_all('td')
        track = all_cells.a.href.get_text()
        href = all_cells.get('href')
        trackresults = base1_url + href
This:
yesterday1 = yesterday.strftime(format)
is a string: strftime formats a datetime into text. enddate1 starts out as a string too, and after enddate1 =+ timedelta(days=1) (which is really enddate1 = +timedelta(days=1)) it becomes a timedelta, so the while condition compares a timedelta with a string. That's why you are getting that error.
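A minimal sketch of the fix: keep datetime objects for the loop arithmetic and comparison, and format to a string only when building the URL (note %Y for the four-digit year the URLs above use, unlike the %y in the question):
from datetime import datetime, timedelta
base_url = "http://www.harness.org.au/racing/results/?firstDate="
current = datetime(2019, 1, 1)
yesterday = datetime.today() - timedelta(days=1)
while current <= yesterday:  # compare datetimes, not strings
    new_url = base_url + current.strftime("%d-%m-%Y")  # format only for the URL
    # ... fetch and parse new_url here, as in the question ...
    current += timedelta(days=1)  # += (not =+) advances the date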