Python. Updating values to existing columns excel - python

I have an excel file like this.
I am scraping new data from the web for D and E column using this code.
import csv
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
urls =['https://www.linkedin.com/in/felipe-fs',
'https://www.linkedin.com/in/lucascacao',
'https://www.linkedin.com/in/silvia-florido-107a2355',
'https://www.linkedin.com/in/alesillva',
'https://www.linkedin.com/in/marcellogpassos',
'https://www.linkedin.com/in/ana-luiza-fidelis-de-sousa',
'https://www.linkedin.com/in/thiagoanjos',
'https://www.linkedin.com/in/eduardoneves',
'https://www.linkedin.com/in/gabriel-de-santana-weizenmann-73aab7116',
'https://www.linkedin.com/in/felipebluiz']
header_added = False
timestr = time.strftime("%Y%m%d-%H%M%S")
chrome_options = Options()
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe', options=chrome_options)
driver.maximize_window()
for url in urls:
driver.get(url)
try:
n = False
company = driver.find_element_by_xpath('//ul[#class="pv-top-card--experience-list"]')
if not n:
res = company.text
n = True
html = driver.find_element_by_tag_name('html')
for i in range(2):
html.send_keys(Keys.PAGE_DOWN)
time.sleep(3)
experience = driver.find_elements_by_tag_name('h4')
duration = experience[0].text
with open('test.xlxs', 'a', encoding='utf-8-sig') as f:
w = csv.writer(f)
w.writerow(['', '', '', res, duration])
except:
print("User has an older job")
The data in D and E is only for 4-5 rows, so this can be overwritten. How do I update these columns with the new scraped data? I saw many answers but not sure how to make a data frame of scraped data.I need to read the url from the same file and scrape data and add it to the existing columns. The url list in the code is just a test.
EDIT:- I looked around and decided it's more convenient to simply convert the xlsx to a csv. So I installed xlrd ver 1.2 and did that.

Input: String url was use and newdata of D and C column.
Output: update row.
from openpyxl import load_workbook,Workbook
def update(url,dataDcol,dataEcol):
try:
#get date
wk = load_workbook('oldfile.xlsx')
wh = wk.active
for row in wh['C']:
if row.value == url:
wh['D{}'.format(row.row)] = dataDcol
wh['E{}'.format(row.row)] = dataEcol
break
wk.save('oldfile.xlsx')
wk.close()
except Exception as e:
print('error :' + str(e))

Related

Scraped data is not saving to csv file as it keeps returning a blank csv file

My scraper is calling the website and hitting each of the 44 pages and creating a csv file but the csv file is empty. I am returning after each of the functions and saving the data to a csv at the end of the scraper.
Can anyone see what is wrong with my code?
Code:
import pandas,requests,bs4,time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-"+TODAY+".csv"
driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)
URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
'From': 'myemail'
}
def interceptor(request):
del request.headers['From']
request.headers['From'] = HEADERS["From"]
driver.request_interceptor = interceptor
def parse_job_post_div(div_html):
soup = bs4.BeautifulSoup(div_html)
job_ls = soup.findAll("div",{"class":"information"})
job_data = []
for job in job_ls:
job_listing = job.find("div",{"class":"information"}).get_text(separator=", ").strip()
title = job.find("span",{"role":"heading"}).get_text(separator=", ").strip()
job_location = job.find("p",{"class":"job-info"}).get_text(separator=", ").strip()
new_row = {"job_listing":job,"title":title,"job_location":job_location}
job_data.append(new_row)
return job_data
def get_data(wd):
job_postings = driver.find_element(By.CLASS_NAME, "information")
html = job_postings.get_attribute("innerHTML")
parsed = parse_job_post_div(html)
return pandas.DataFrame(parsed)
def process_page(url):
driver.get(url)
master_data = []
i = 0
while True:
df = get_data(driver)
master_data.append(df)
if i == (MAX_PAGE - 1):
break
driver.find_element(By.XPATH, "//span[#class='icon icon-arrow-right']").click()
time.sleep(10)
print(i)
i+=1
return pandas.concat(master_data,ignore_index=True)
data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
`
I have tried the above code.
The first problem I found in your code is that the job_ls is an empty list, i.e. soup.findAll("div",{"class":"information"}) doesn't find anything.
Moreover, job_postings contains only one webelement (i.e. the first job of the list) instead of all 10 jobs shown in the page, that's because you used .find_element instead of .find_elements. As a result of these and other problems, process_page(URL_BASE) returns an empty dataframe.
In this case you can speed up the process and use less code using directly selenium instead of bs4
driver.get(URL_BASE)
driver.implicitly_wait(30)
MAX_PAGE = 4
titles, locations, descriptions = [], [], []
for i in range(MAX_PAGE):
print('current page:',i+1,end='\r')
titles += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > span[role=heading]')]
locations += [loc.text.replace('\n',', ') for loc in driver.find_elements(By.CSS_SELECTOR, '.information > p[class=job-info]')]
descriptions += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > p[data-ph-at-id=jobdescription-text')]
if i < MAX_PAGE-1:
driver.find_element(By.XPATH, "//span[#class='icon icon-arrow-right']").click()
else:
break
df = pandas.DataFrame({'title':titles,'location':locations,'description':descriptions})
df.to_csv(SAVE_FILENAME, index=False)
and df will be something like

How to extract data in the right order with Beautiful Soup

I am trying to extract the balance sheet for an example ticker "MSFT" (Microsoft) from Yahoo Finance.
Using Selenium to click on the button "Expand All" before any scraping is done. This part seems to work.
By the way, when the Chrome web driver is launched, I manually click on the button(s) to accept or reject cookies. In a later step, I plan to add some more code so that this part is also automated. My question is though not on this one now.
Below is how the code currently looks like.
# for scraping the balance sheet from Yahoo Finance
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(chrome_options=options)
driver.get(url)
WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
By.XPATH, "//section[#data-test='qsp-financial']//span[text()='Expand All']"))).click()
#content whole page in html format
soup = BeautifulSoup(driver.page_source, 'html.parser')
# get the column headers (i.e. 'Breakdown' row)
div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
if len(div) < 1:
print("Fail to retrieve table column header")
exit(0)
# get the list of columns from the column headers
col = []
for h in div[0].find_all('span'):
text = h.get_text()
if text != "Breakdown":
col.append(datetime.strptime(text, "%m/%d/%Y"))
df = pd.DataFrame(columns=col)
# the following code returns an empty list for index (why?)
# and values in a list that need actually be in a DataFrame
idx = []
for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
for h in div.find_all('title'):
text = h.get_text()
idx.append(text)
val = []
for div in soup.find_all('div', attrs={'data-test': 'fin-col'}):
for h in div.find_all('span'):
num = int(h.get_text().replace(",", "")) * 1000
val.append(num)
# if the above part is commented out and this block is used instead
# the following code manages to work well until the row "Cash Equivalents"
# that is because there are no entries for years 2020 and 2019 on this row
""" for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
i = 0
idx = ""
val = []
for h in div.find_all('span'):
if i % 5 == 0:
idx = h.get_text()
else:
num = int(h.get_text().replace(",", "")) * 1000
val.append(num)
i += 1
row = pd.DataFrame([val], columns=col, index=[idx])
df = pd.concat([df, row], axis=0) """
return idx, val
get_balance_sheet_from_yfinance("MSFT")
I could not get the data scraped from the expanded table in a usable tabular format. Instead, the function above returns what I managed to scrape from the webpage. There are some additional comments in the code.
Could you give me some ideas on how to properly extract the data and put it into a DataFrame object with index which should be the text under the "Breakdown" column? Basically, the DataFrame should look like the snapshot below, with what is under the first column in there being the index.
balance-sheet-df
i've spent a long time on this, hope it helps, basically your function now returns a dataFrame with the following formatting:
2022-06-29 2021-06-29 2020-06-29 2019-06-29
Total Assets 364,840,000 333,779,000 301,311,000 286,556,000
Current Assets 169,684,000 184,406,000 181,915,000 175,552,000
Cash, Cash Equivalents & Short Term Investments 104,749,000 130,334,000 136,527,000 133,819,000
Cash And Cash Equivalents 13,931,000 14,224,000 13,576,000 11,356,000
Cash 8,258,000 7,272,000 - -
... ... ... ... ...
Tangible Book Value 87,720,000 84,477,000 67,915,000 52,554,000
Total Debt 61,270,000 67,775,000 70,998,000 78,366,000
Net Debt 35,850,000 43,922,000 49,751,000 60,822,000
Share Issued 7,464,000 7,519,000 7,571,000 7,643,000
Ordinary Shares Number 7,464,000 7,519,000 7,571,000 7,643,000
and here's the final code:
# for scraping the balance sheet from Yahoo Finance
from time import sleep
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(chrome_options=options)
driver.get(url)
WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
By.XPATH, "//section[#data-test='qsp-financial']//span[text()='Expand All']"))).click()
# content whole page in html format
soup = BeautifulSoup(driver.page_source, 'html.parser')
# get the column headers (i.e. 'Breakdown' row)
div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
if len(div) < 1:
print("Fail to retrieve table column header")
exit(0)
# get the list of columns from the column headers
col = []
for h in div[0].find_all('span'):
text = h.get_text()
if text != "Breakdown":
col.append(datetime.strptime(text, "%m/%d/%Y"))
row = {}
for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
head = div.find('span').get_text()
i = 4
for h in div.find_all('span'):
if h.get_text().replace(',', '').isdigit() or h.get_text()[0] == '-':
row[head].append(h.get_text())
i += 1
else:
while i < 4:
row[head].append('')
i += 1
else:
head = h.get_text()
row[head] = []
i = 0
for k, v in row.items():
while len(v) < 4:
row[k].append('-')
df = pd.DataFrame(columns=col, index=row.keys(), data=row.values())
print(df)
return df
get_balance_sheet_from_yfinance("MSFT")
i've removed some od the unused code and added a new scrapping method, but i have kept your method of getting the dates of all the columns.
if you have any questions don't hesitate to ask in the comments.

How can I loop through several pages to download excel files using Selenium and Python

I am trying to build a web scraper that will go through a website's pages and download the excel files from a dropdown menu at the bottom of the page.
The webpages only allow me to download the 50 locations that are displayed on each page and I cannot download all of them at once.
I am able to download the first page's Excel file, but the following pages yield nothing else.
I get the following output after running the code I have provided below.
Skipped a page
No more pages.
If I exclude the lines where it asks to download the pages, it is able to go through each page until the end successfully.
I'll provide an example below for what I am trying to get accomplished.
I would appreciate any help and advice! Thank you!
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
state = 'oklahoma'
rent_to_own = 'rent to own'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx')
industry = driver.find_element(By.ID, "txtKeyword")
industry.send_keys(rent_to_own)
location = driver.find_element(By.ID, "txtLocation")
location.send_keys(state)
driver.find_element(By.ID, "btnSubmit").click()
driver.implicitly_wait(3)
def web_scrape():
more_drawer = driver.find_element(By.XPATH, "//div[#class='more-drawer']//a[#href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y#Des']")
more_drawer.click()
driver.implicitly_wait(5)
get_50 = Select(driver.find_element(By.ID, 'ViewPerPage'))
get_50.select_by_value('50')
driver.implicitly_wait(5)
filter_description = driver.find_element(By.XPATH, "//ul[#class='filters-list']//a[#href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y&pagesize=50&currentpage=1&descfilter=Furniture~B~Renting ~F~ Leasing']")
filter_description.click()
while True:
try:
download_excel = Select(driver.find_element(By.ID, 'ResultsDownload'))
download_excel.select_by_value('Excel')
driver.implicitly_wait(20)
first_50 = driver.find_element(By.XPATH, "//div[#id='relatedOccupations']//a[#onclick='hideMoreRelatedOccupations()']")
first_50.click()
driver.implicitly_wait(20)
next_page = driver.find_element(By.XPATH, "//div[#class='pagination-wrap']//div//a[#class='next-page']")
next_page.click()
driver.implicitly_wait(20)
print("Skipped a page.")
except:
print("No more pages.")
return
web_scrape()
Below is something that works. Again I would think the way I went about this could be improved. I stuck with Selenium but you really don't even need to open the webpage and can just webscrape using correct URL params with Beautiful Soup. Also the fastest way was probably not to write every item into excel one at a time but it works, better way is probably using pandas and then creating an excel workbook at the end. But anyway if you have any questions let me know.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import openpyxl as xl
import os
import math
cwd = os.getcwd() #Or whatever dir you want
filename = '\test123.xlsx'
location = 'oklahoma'
keyword = 'rent to own'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=1&lang=en')
driver.implicitly_wait(3)
wb = xl.Workbook()
ws = wb.worksheets[0]
#get number of pages
ret = driver.find_element(By.ID, 'recordNumber')
lp = math.ceil(float(ret.text)/50)
r = 1
for i in range(1, lp):
print(i)
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=' + str(i) + '&lang=en')
table_id = driver.find_elements(By.CLASS_NAME, 'res-table')[0]
rows = table_id.find_elements(By.TAG_NAME, "tr")
for count, row in enumerate(rows, start=1):
if count >= 0:
cols = row.find_elements(By.TAG_NAME, "td")
refs = row.find_elements(By.TAG_NAME, "a")
for c, ref in enumerate(refs, start=1):
ws.cell(row=r, column=c).value = '=HYPERLINK("{}", "{}")'.format(ref.get_attribute("href"), ref.text)
for c, col in enumerate(cols, start=1):
if c > 1:
ws.cell(row=r, column=c).value = col.text
r += 1
wb.save(cwd + filename)
print('done')
This returns an excel file with 750+ rows of data with links included.

Can I pause a scroll function in selenium, scrape the current data, and then continue scrolling later in the script?

I am a student working on a scraping project and I am having trouble completing my script because it fills my computer's memory with all of the data is stores.
It currently stores all of my data until the end, so my solution to this would be to break up the scrape into smaller bits and then write out the data periodically so it does not just continue to make one big list and then write out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data that I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
Data = []
driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[#align='right']/a")
while len(profile_count) < count: # Get links up to "count"
body.send_keys(Keys.END)
sleep(1)
profile_count = driver.find_elements_by_xpath("//div[#align='right']/a")
for link in profile_count: # Calling up links
temp = link.get_attribute('href') # temp for
driver.execute_script("window.open('');") # open new tab
driver.switch_to.window(driver.window_handles[1]) # focus new tab
driver.get(temp)
# scrape code
Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
IssuedBy = "Board of Certified Safety Professionals"
CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
try:
AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
except NoSuchElementException:
AccreditedBy = "N/A"
try:
Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
except NoSuchElementException:
Expires = "N/A"
info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
Data.extend(info)
driver.close()
driver.switch_to.window(driver.window_handles[0])
with open("Spredsheet.txt", "w") as output:
output.write(','.join(Data))
driver.close()
Test.py
Displaying Test.py.
Try the below approach using requests and beautifulsoup. In the below script i have used the API URL fetched from website itself for ex:-API URL
First it will create the URL(refer first url) for first iteration, add headers and data in .csv file.
Second iteration it will again create the URL(refer second url) with 2 extra params start_on_page=20 & show_per_page=20 where start_on_page number 20 is incremented by 20 on each iteration and show_per_page = 100 defaulted to extract 100 records per iteration so on till all the data dumped in to the .csv file.second iteration API URL
Script is dumping 4 things number, name, location and profile url.
On each iteration data will be appended to .csv file , so your memory issue will get resolved by this approach.
Do not forget to add your system path in file_path variable where do you want to create .csv file before running the script.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv
def scrap_directory_data():
list_of_credentials = []
file_path = ''
file_name = 'credential_list.csv'
count = 0
page_number = 0
page_size = 100
create_url = ''
main_url = 'https://directory.bcsp.org/search_results.php?'
first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
number_of_records = 0
csv_headers = ['#','Name','Location','Profile URL']
while True:
if count == 0:
create_url = main_url + first_iteration_url
print('-' * 100)
print('1 iteration URL created: ' + create_url)
print('-' * 100)
else:
create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
print('-' * 100)
print('Other then first iteration URL created: ' + create_url)
print('-' * 100)
page = requests.get(create_url,verify=False)
extracted_text = bs(page.text, 'lxml')
result = extracted_text.find_all('tr')
if len(result) > 0:
for idx, data in enumerate(result):
if idx > 0:
number_of_records +=1
name = data.contents[1].text
location = data.contents[3].text
profile_url = data.contents[5].contents[0].attrs['href']
list_of_credentials.append({
'#':number_of_records,
'Name':name,
'Location': location,
'Profile URL': profile_url
})
print(data)
with open(file_path + file_name ,'a+') as cred_CSV:
csvwriter = csv.DictWriter(cred_CSV, delimiter=',',lineterminator='\n',fieldnames=csv_headers)
if idx == 0 and count == 0:
print('Writing CSV header now...')
csvwriter.writeheader()
else:
for item in list_of_credentials:
print('Writing data rows now..')
print(item)
csvwriter.writerow(item)
list_of_credentials = []
count +=1
page_number +=20
scrap_directory_data()

Scraped data is not saving to csv file as it keeps returning a blank csv file.

Can anyone see what's wrong with this code?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
import csv
def races(main_url):
driver = webdriver.Chrome()
driver.get(main_url)
driver.implicitly_wait(2)
races = driver.find_elements_by_class_name('time-location')
races = [race.text[:5] for race in races]
races = [race.replace(':', '') for race in races]
driver.close()
return races
def scrape(url):
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//[#id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
for horse in driver.find_elements_by_class_name('card-item'):
horseName = horse.find_element_by_class_name('form-link').text
times = horse.find_elements_by_class_name('sectionals-time')
times = [time.text for time in times]
print('{}: {}'.format(horseName, times))
print()
driver.close()
So at this next point below I am trying to save the data to df, but it returns a blank doc when opened. Should df = open('jan1.csv', 'w+') not store the scraped data into the csv file. I'm obviously missing something but can't see what.
def main():
df = open('jan1.csv', 'w+')
df.close()
date = '1-January-2018'
main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
for race in races(main_url):
url = main_url + '/' + race
print(url)
scrape(url)
if __name__ == '__main__':
main()
Your code seems broken in several places and even with fixing it I get Timeout errors.
Try these steps:
Add pandas for easy data handling:
import pandas as pd
def scrape(url):
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//[#id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
# add empty list to save scraped data
data = []
for horse in driver.find_elements_by_class_name('card-item'):
horseName = horse.find_element_by_class_name('form-link').text
times = horse.find_elements_by_class_name('sectionals-time')
times = [time.text for time in times]
print('{}: {}'.format(horseName, times))
data.append([horseName, times])
print()
driver.close()
# return your data!
return data
Then change this in your main function:
def main():
date = '1-January-2018'
main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
tmp = []
for race in races(main_url):
url = main_url + '/' + race
print(url)
tmp.append(scrape(url))
df = pd.DataFrame(tmp)
df.to_csv("jan1.csv")
Or if you want to stick to csv only (no pandas):
with open("jan1.csv", "w+") as file:
file.write(your_data_var_here)

Categories

Resources