How can I export a DataFrame to Excel without overwriting the previous data?
For example:
I'm doing web scraping on a table with pagination, so I scrape page 1, save it in a DataFrame, export it to Excel, and then do the same for page 2. But every previous record is erased each time I save, so only the last page remains.
Sorry for my English, here is my code:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

i = 1
url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1"

driver = webdriver.Firefox(executable_path=r'C:/Users/Fabio\Desktop/robo/geckodriver.exe')
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table/thead/tr/th[9]").click()

contador = 1

# loop over the pagination
while contador < 4:
    # 1. Find the table
    elemento = driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]")
    html_content = elemento.get_attribute('outerHTML')

    # 2. Parse the HTML - BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # 3. DataFrame - pandas
    df_full = pd.read_html(str(table))[0]
    df = df_full[['PLAYER', 'TEAM', 'PTS']]
    df.columns = ['jogador', 'time', 'pontuacao']
    dados1 = pd.DataFrame(df)

    # go to the next page
    driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[1]/div/div/a[2]").click()
    contador = contador + 1

# 4. Export to Excel (only the last page survives)
dados = pd.DataFrame(df)
dados.to_excel("fabinho.xlsx")
driver.quit()
You are re-assigning df to whatever data you retrieved every time you go through the loop. A solution is to append each page's DataFrame to a list and then pd.concat the list at the end.
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

i = 1
url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1"

driver = webdriver.Firefox(executable_path=r'C:/Users/Fabio\Desktop/robo/geckodriver.exe')
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table/thead/tr/th[9]").click()

contador = 1
df_list = list()

# loop over the pagination
while contador < 4:
    # 1. Find the table
    elemento = driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]")
    html_content = elemento.get_attribute('outerHTML')

    # 2. Parse the HTML - BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # 3. DataFrame - pandas
    df_full = pd.read_html(str(table))[0]
    df = df_full[['PLAYER', 'TEAM', 'PTS']]
    df.columns = ['jogador', 'time', 'pontuacao']
    df_list.append(df)

    # go to the next page
    driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[1]/div/div/a[2]").click()
    contador = contador + 1

# 4. Concatenate all pages and export to Excel once
dados = pd.concat(df_list)
dados.to_excel("fabinho.xlsx")
driver.quit()
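As a side note, if you would rather keep each scraped page on its own worksheet instead of concatenating them into one sheet, here is a minimal sketch using pandas' ExcelWriter (the sheet names are just illustrative):

import pandas as pd

# Sketch: write every page's DataFrame to its own sheet of the same workbook.
with pd.ExcelWriter("fabinho.xlsx") as writer:   # needs an Excel engine such as openpyxl installed
    for numero, df_pagina in enumerate(df_list, start=1):
        df_pagina.to_excel(writer, sheet_name=f"pagina_{numero}", index=False)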
I need to set up a table with the following columns: date, time, Agua 6 Horas, Agua Diario, (QFE), Radiación, Humedad, Viento promedio, Viento Temperatura del Aire Seco.
This is the station link; in table 6 are the elements of interest that will fill each column of the table I intend to assemble. Each desired element has an index in that table, and the ones that matter are 57, 60, 64, 89, 66, 28 and 26. When you open one of those elements, its data is grouped in yet another table, and that is the data I need to pull into the corresponding column of the table I want to build.
Below is how I thought of getting the data; I just can't gather it all into a single table. If anyone can help me I would be grateful.
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

pages = [
    'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/57/119',
    'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/60/125'
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/64/130',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/89/200',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/66/132',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330007/28/61',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330007/28/60',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/26/58'
]


def make_soup(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    return soup


def get_links(url):
    # links to the monthly-data page of every element listed for the station
    soup = make_soup(url)
    a_tags = soup.find_all('a', href=re.compile(r"^/application/informacion/datosMensualesDelElemento/330006/"))
    links = [urljoin(url, a.get('href')) for a in a_tags]
    return links


def get_data(link):
    # rows of the element's data table as lists of cell text
    soup = make_soup(link)
    table = soup.find('table', {'class': 'table table-striped table-bordered'})
    data = [[' '.join(item.text.split()) for item in tcel.select("td")]
            for tcel in table.select("tr")]
    return data


if __name__ == '__main__':
    columns_name = ['Data Medicao', 'Hora Medicao', 'RRR6 (mm)', 'Traza ()']
    data = []
    for page in pages:
        links = get_links(pages[0])
        for link in links:
            data.append(get_data(link))

    df = pd.DataFrame(data)
    df = df.iloc[1:]
    df.to_csv('data.csv', index=False)
    print(df)
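One possible way to gather everything into a single table, sketched against the pages, get_links and get_data helpers above. How the columns should line up (simple stacking versus merging on the date/time columns) depends on the real layout of each element's table, so treat that part as an assumption:

import pandas as pd

frames = []
for page in pages:
    for link in get_links(page):
        rows = get_data(link)            # list of rows of cell text
        rows = [r for r in rows if r]    # skip rows with no <td> cells (header rows)
        frames.append(pd.DataFrame(rows))

# Stack all element tables into one frame; switch to pd.merge on the
# date/time columns if the elements should sit side by side instead.
full = pd.concat(frames, ignore_index=True)
full.to_csv('data.csv', index=False)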
import openpyxl

xl_file = openpyxl.Workbook()
xl_sheet = xl_file.active

from urllib.request import urlopen
from bs4 import BeautifulSoup

stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")

maxPage = source.find_all("table", align="center")
mp = maxPage[0].find_all("td", class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])

for page in range(1, 10):
    print(str(page))
    url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem + '&page=' + str(page)
    html = urlopen(url)
    source = BeautifulSoup(html.read(), "html.parser")
    srlists = source.find_all("tr")
    isCheckNone = None

    if (page % 1) == 0:
        time.sleep(0)

    for i in range(1, len(srlists) - 1):
        if srlists[i].span != isCheckNone:
            srlists[i].td.text
            data1 = srlists[i].find_all("td", align="center")
            data2 = srlists[i].find_all("td", class_="num")
            print(srlists[i].find_all("td", align="center")[0].text, srlists[i].find_all("td", class_="num")[0].text)
            for item in data1:
                xl_sheet.append([item.get_text()])
This is what I've done to crawl stock data from the site.
The crawling itself works, but I couldn't save the data into an Excel file properly.
When I tried, the file only showed the date data without the price data.
How can I export the results to an Excel file?
There were two things you missed:
1) a mistake in the imports (the time module is used but never imported), and
2) data2, which contains the prices, was never appended to the worksheet.
Here is the final code, which should give your desired output. Just put your own folder location in the path used for saving the Excel file.
import time
from openpyxl import Workbook

xl_file = Workbook()
xl_sheet = xl_file.active

from urllib.request import urlopen
from bs4 import BeautifulSoup

i = 0
stockItem = '028300'
url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem
html = urlopen(url)
source = BeautifulSoup(html.read(), "html.parser")

maxPage = source.find_all("table", align="center")
mp = maxPage[0].find_all("td", class_="pgRR")
mpNum = int(mp[0].a.get('href')[-3:])

for page in range(1, 10):
    print(str(page))
    url = 'http://finance.naver.com/item/sise_day.nhn?code=' + stockItem + '&page=' + str(page)
    html = urlopen(url)
    source = BeautifulSoup(html.read(), "html.parser")
    srlists = source.find_all("tr")
    isCheckNone = None

    if (page % 1) == 0:
        time.sleep(0)

    for i in range(1, len(srlists) - 1):
        if srlists[i].span != isCheckNone:
            srlists[i].td.text
            data1 = srlists[i].find_all("td", align="center")   # dates
            data2 = srlists[i].find_all("td", class_="num")      # prices
            # print(srlists[i].find_all("td", align="center")[0].text, srlists[i].find_all("td", class_="num")[0].text)
            for item1, item2 in zip(data1, data2):
                xl_sheet.append([item1.get_text(), item2.get_text()])

print(xl_sheet)
xl_file.save(r'C:\Users\Asus\Desktop\vi.xlsx')
Suggestion: you can use the yfinance package for Python to download stock data easily.
You can follow this link: https://pypi.org/project/yfinance/
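For example, a minimal yfinance sketch (the KOSDAQ suffix .KQ for ticker 028300 is an assumption, so check the exact symbol on Yahoo Finance):

import yfinance as yf

# Daily prices for the last three months, written straight to an Excel file.
df = yf.download("028300.KQ", period="3mo", interval="1d")
df.to_excel("stock_028300.xlsx")   # needs openpyxl installed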
I am trying to write data to a CSV file after scraping, using a pandas DataFrame, but the CSV is empty even after the program finishes. The headers are written first, but they are overwritten once the DataFrame writing comes into play.
Here is the code:
from bs4 import BeautifulSoup
import requests
import re as resju
import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')

links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

headers = ['Name', 'LINK']
# this is the output file, you can change the path as you like; default is the working directory
file = open('data123.csv', 'w', encoding="utf-8")
writer = csv.writer(file)
writer.writerow(headers)

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']

    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text

    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)

    df = pd.DataFrame(zip(Name, final_link))
    df.to_csv(file, header=None, index=False)

file.close()
df.head() returns:

     0  1
0    ل  h
1    ي  t
2    ل  t
3    ى  p
4       s

Any suggestions?
It seems you are using a mix of libraries to write the CSV; pandas handles all of this nicely, so there is no need to use Python's built-in csv module.
I've modified your code below: it collects everything first, builds the DataFrame as a whole, and writes it out as a CSV once.
Also, by passing header=None you were setting the columns to nothing, so they would only be referenced by an index number.
from bs4 import BeautifulSoup
import requests
import re as resju
#import csv
import pandas as pd

re = requests.get('https://www.farfeshplus.com/Video.asp?ZoneID=297')
soup = BeautifulSoup(re.content, 'html.parser')

links = soup.findAll('a', {'class': 'opacityit'})
links_with_text = [a['href'] for a in links]

names_ = []        # global list to hold all iterable variables from your loops
final_links_ = []

for i in links_with_text:
    new_re = requests.get(i)
    new_soup = BeautifulSoup(new_re.content, 'html.parser')
    m = new_soup.select_one('h1 div')
    Name = m.text
    names_.append(Name)  # append to the global list
    print(Name)

    n = new_soup.select_one('iframe')
    ni = n['src']

    iframe = requests.get(ni)
    i_soup = BeautifulSoup(iframe.content, 'html.parser')
    d_script = i_soup.select_one('body > script')
    d_link = d_script.text

    mp4 = resju.compile(r"(?<=mp4:\s\[\')(.*)\'\]")
    final_link = mp4.findall(d_link)[0]
    print(final_link)
    final_links_.append(final_link)  # append to the global list

df = pd.DataFrame(zip(names_, final_links_))  # use the global lists
df.columns = ['Name', 'LINK']
df.to_csv('data123.csv', index=False)  # write the whole frame once, after the loop
Below is my code, which returns the annual data. Since the quarterly view is not a different link but rather a button on the page, I cannot figure out how to pull it. I have spent days on this and am finally resorting to asking for help.
The end goal is Excel output with balance sheets, cash flows, etc., but I need them on a quarterly basis.
Any help is welcome. Thank you.
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd
import xlrd


def scrape_table(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    ## page.content rather than page.text because html.fromstring implicitly expects bytes as input
    table = tree.xpath('//table')
    ## XPath is a way of locating information in structured documents such as HTML or XML
    assert len(table) == 1

    df = pd.read_html(lxml.etree.tostring(table[0], method='html'))[0]
    df = df.set_index(0)
    # df = df.dropna()
    df = df.transpose()
    df = df.replace('-', '0')

    # The first column should be a date
    df[df.columns[0]] = pd.to_datetime(df[df.columns[0]])
    cols = list(df.columns)
    cols[0] = 'Date'
    df = df.set_axis(cols, axis='columns', inplace=False)

    numeric_columns = list(df.columns)[1::]
    df[numeric_columns] = df[numeric_columns].astype(np.float64)
    return df


loc = (r"F:\KateLaptop2019\Work\DataAnalysis\listpubliccompanies.xlsx")
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)
sheet.cell_value(0, 0)

companies = []
for i in range(1, sheet.nrows):
    companies.append((sheet.cell_value(i, 1).strip()))


def annual_financials():
    for item in companies:
        try:
            balance_sheet_url = 'https://finance.yahoo.com/quote/' + item + '/balance-sheet?p=' + item
            download_destination = (r'F:\KateLaptop2019\Work\DataAnalysis\OilCompanyResearch\CompanyFinancials\BalanceSheet\\' + item + ".xlsx")
            df_balance_sheet = scrape_table(balance_sheet_url)
            df_balance_sheet.to_excel(download_destination)
        except:
            print(item, "key error")
            pass


annual_financials()
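Not a full answer, but one possible route as a sketch: the Annual/Quarterly switch is rendered by JavaScript, so plain requests only ever sees the annual figures. You could drive the page with Selenium, click the button, and then parse driver.page_source. The button locator and the example ticker below are assumptions and have not been checked against the current Yahoo Finance markup:

import time
import pandas as pd
from selenium import webdriver

def scrape_quarterly_table(url):
    # Hypothetical sketch: the XPath for the "Quarterly" toggle is a guess.
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        time.sleep(5)                                               # let the page render
        driver.find_element_by_xpath("//button[contains(., 'Quarterly')]").click()
        time.sleep(3)                                               # wait for the table to refresh
        return pd.read_html(driver.page_source)[0]                  # parse the rendered HTML
    finally:
        driver.quit()

# Example call; 'XOM' is only an illustrative ticker.
df_q = scrape_quarterly_table('https://finance.yahoo.com/quote/XOM/balance-sheet?p=XOM')
df_q.to_excel('XOM_quarterly_balance_sheet.xlsx')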
I want to retrieve a specific stat (PPDA) from multiple matches on this site:
https://understat.com/match/xxxx
I have created the following to parse the HTML and loop through each match using Python, but I am struggling with how to extract the specific stat and load it into a CSV and a graph. I am a beginner and any help would be appreciated!
Code:
import pandas as pd
import re
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime
import csv

for i in range(9577, 9807):
    ppda_url = 'https://understat.com/match/' + str(i)
    ppda_data = requests.get(ppda_url)
    ppda_html = ppda_data.content
    soup = BeautifulSoup(ppda_html, 'lxml')

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(chrome_options=options)
driver.get(ppda_url)
soup = BeautifulSoup(driver.page_source, 'lxml')
To extract the data with BeautifulSoup and write it to a CSV file, first find the div element with the text PPDA. Then find the next div element with the class progress-value (the home value), and from that one the next div with the class progress-value (the away value), and take the text from those two divs. Write it to a CSV file like so:
import requests
from bs4 import BeautifulSoup
import csv

with open('ppda.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for i in range(9577, 9807):
        ppda_url = 'https://understat.com/match/' + str(i)
        ppda_data = requests.get(ppda_url)
        ppda_html = ppda_data.content
        soup = BeautifulSoup(ppda_html, 'lxml')

        # the div that says "PPDA", followed by the home and away values
        ppda = soup.find("div", string='PPDA')
        home = ppda.findNext('div', {'class': "progress-value"})
        away = home.findNext('div', {'class': "progress-value"})
        print(home.text, away.text)
        writer.writerow([home.text, away.text])
To graph it, have a look at matplotlib for a start.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame(columns=['HOME', 'AWAY'])

for i in range(9577, 9807):
    ppda_url = 'https://understat.com/match/' + str(i)
    ppda_data = requests.get(ppda_url)
    ppda_html = ppda_data.content
    soup = BeautifulSoup(ppda_html, 'lxml')

    ppda = soup.find("div", string='PPDA')
    home = ppda.findNext('div', {'class': "progress-value"})
    away = home.findNext('div', {'class': "progress-value"})
    print(home.text, away.text)
    df = df.append({'HOME': float(home.text), 'AWAY': float(away.text)}, ignore_index=True)

#print(df)
df.to_csv("ppda2.csv", encoding='utf-8', index=False)
df.plot.bar()
plt.show()
Outputs: CSV file and Graph
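One caveat if you are on a recent pandas: DataFrame.append was deprecated and removed in pandas 2.0, so the row-by-row append above will fail there. A minimal sketch of the same loop collecting rows in a plain list and building the DataFrame once at the end (the same idea as the list plus pd.concat fix in the first answer above):

import requests
import pandas as pd
from bs4 import BeautifulSoup

rows = []   # one dict per match
for i in range(9577, 9807):
    soup = BeautifulSoup(requests.get('https://understat.com/match/' + str(i)).content, 'lxml')
    ppda = soup.find("div", string='PPDA')
    home = ppda.findNext('div', {'class': "progress-value"})
    away = home.findNext('div', {'class': "progress-value"})
    rows.append({'HOME': float(home.text), 'AWAY': float(away.text)})

df = pd.DataFrame(rows)    # build the frame once instead of appending row by row
df.to_csv("ppda2.csv", encoding='utf-8', index=False)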