Below is my code for pulling annual data. Since the quarterly view is not a separate link but a button on the page, I cannot figure out how to pull it. I have spent days on this and am finally resorting to asking for help.
The end goal is Excel output with balance sheets, cash flows, etc., but I need it on a quarterly basis.
Any help is welcome. Thank you.
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd
import xlrd
def scrape_table(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    # page.content rather than page.text because html.fromstring implicitly expects bytes as input
    table = tree.xpath('//table')
    # XPath is a way of locating information in structured documents such as HTML or XML
    assert len(table) == 1
    df = pd.read_html(lxml.etree.tostring(table[0], method='html'))[0]
    df = df.set_index(0)
    # df = df.dropna()
    df = df.transpose()
    df = df.replace('-', '0')
    # The first column should be a date
    df[df.columns[0]] = pd.to_datetime(df[df.columns[0]])
    cols = list(df.columns)
    cols[0] = 'Date'
    df = df.set_axis(cols, axis='columns', inplace=False)
    numeric_columns = list(df.columns)[1::]
    df[numeric_columns] = df[numeric_columns].astype(np.float64)
    return df
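For context, scrape_table is called with a Yahoo Finance statement URL and returns the statement as a DataFrame with one row per period; a minimal usage example (the ticker is illustrative):

# Illustrative call; any Yahoo Finance statement URL of this shape should work.
df = scrape_table('https://finance.yahoo.com/quote/XOM/balance-sheet?p=XOM')
print(df.head())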
loc = r"F:\KateLaptop2019\Work\DataAnalysis\listpubliccompanies.xlsx"
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)
sheet.cell_value(0, 0)

companies = []
for i in range(1, sheet.nrows):
    companies.append(sheet.cell_value(i, 1).strip())
def annual_financials():
    for item in companies:
        try:
            balance_sheet_url = 'https://finance.yahoo.com/quote/' + item + '/balance-sheet?p=' + item
            download_destination = (r'F:\KateLaptop2019\Work\DataAnalysis\OilCompanyResearch\CompanyFinancials\BalanceSheet\\' + item + ".xlsx")
            df_balance_sheet = scrape_table(balance_sheet_url)
            df_balance_sheet.to_excel(download_destination)
        except:
            print(item, "key error")
            pass

annual_financials()
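If scraping the quarterly view directly proves difficult (the button swaps the data in with JavaScript, so requests never sees it), a minimal alternative sketch, assuming the third-party yfinance package is an acceptable dependency, is to pull the quarterly statements it exposes and write them to Excel:

# Sketch using the yfinance package (pip install yfinance); the quarterly_*
# attributes are yfinance's own quarterly statements, not the Yahoo page table.
# The output path here is illustrative only.
import yfinance as yf
import pandas as pd

def quarterly_financials(ticker):
    t = yf.Ticker(ticker)
    # Each of these is a DataFrame with one column per quarter.
    return {
        'balance_sheet': t.quarterly_balance_sheet,
        'cash_flow': t.quarterly_cashflow,
        'income_statement': t.quarterly_financials,
    }

for item in companies:
    try:
        statements = quarterly_financials(item)
        # One workbook per company, one sheet per statement.
        with pd.ExcelWriter(item + "_quarterly.xlsx") as writer:
            for name, frame in statements.items():
                frame.to_excel(writer, sheet_name=name)
    except Exception as exc:
        print(item, exc)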
I need to set up a table with the following columns: date, time, Agua 6 Horas, Agua Diario, (QFE), Radiación, Humedad, Viento promedio, Viento Temperatura del Aire Seco.
This is the station link; in table 6 are the items of interest that fill each column of the table I intend to assemble. Each desired element has an index in that table; the ones that matter are 57, 60, 64, 89, 66, 28, and 26. I need to get the data from this table and use it to fill each column of the table I want to build. When accessing an element of the table, the data is grouped in another table. That's how I thought of getting the data; I just can't gather it all into a single table. If anyone can help me I would be grateful. (A sketch of one way to combine the tables is shown after the code below.)
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

pages = [
    'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/57/119',
    'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/60/125',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/64/130',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/89/200',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/66/132',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330007/28/61',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330007/28/60',
    # 'https://climatologia.meteochile.gob.cl/application/informacion/inventarioComponentesPorEstacion/330006/26/58',
]
def make_soup(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    return soup

def get_links(url):
    soup = make_soup(url)
    a_tags = soup.find_all('a', href=re.compile(r"^/application/informacion/datosMensualesDelElemento/330006/"))
    links = [urljoin(url, a.get('href')) for a in a_tags]
    return links

def get_data(link):
    soup = make_soup(link)
    table = soup.find('table', {'class': 'table table-striped table-bordered'})
    data = [[' '.join(item.text.split()) for item in tcel.select("td")]
            for tcel in table.select("tr")]
    return data
if __name__ == '__main__':
    columns_name = ['Data Medicao', 'Hora Medicao', 'RRR6 (mm)', 'Traza ()']
    data = []

    for page in pages:
        links = get_links(page)  # was get_links(pages[0]), which only ever read the first page
        for link in links:
            data.append(get_data(link))

    df = pd.DataFrame(data)
    df = df.iloc[1:]
    df.to_csv('data.csv', index=False)
    print(df)
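Since the goal is a single table, a minimal sketch of one approach, assuming each element's monthly table has the date and time in its first two columns and the value of interest in the third, is to build one DataFrame per element and merge them on date/time. The mapping of pages to target column names below is an assumption for illustration:

# Sketch only: reuses get_links/get_data from above; the date/time column
# positions and the outer merge are assumptions about the site's table layout.
import pandas as pd

def element_dataframe(page_url, value_name):
    frames = []
    for link in get_links(page_url):                 # monthly pages for this element
        rows = [r for r in get_data(link) if r]      # drop empty header rows
        frames.append(pd.DataFrame(rows))
    df = pd.concat(frames, ignore_index=True)
    # keep date, time and the first value column; rename to the target column name
    df = df.iloc[:, :3]
    df.columns = ['date', 'time', value_name]
    return df

# one target column name per page in `pages` (names assumed for illustration)
value_names = ['Agua 6 Horas', 'Agua Diario']
merged = None
for page, name in zip(pages, value_names):
    part = element_dataframe(page, name)
    merged = part if merged is None else merged.merge(part, on=['date', 'time'], how='outer')
print(merged)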
I am unable to extract MCC details from PDF. I am able to extract other data with my code.
import tabula.io as tb
from tabula.io import read_pdf

pdf_path = "IR21_SVNMT_Telekom Slovenije d.d._20210506142456.pdf"
# df_list was not defined in the original snippet; presumably all tables are read from the PDF first:
df_list = read_pdf(pdf_path, pages="all")
for df in df_list:
    if 'MSRN Number Range(s)' in df.columns:
        df = df.drop(df.index[0])
        df.columns = df.columns.str.replace('\r', '')
        df.columns = df.columns.str.replace(' ', '')
        df.columns = df.columns.str.replace('Unnamed:0', 'CountryCode(CC)')
        df.columns = df.columns.str.replace('Unnamed:1', 'NationalDestinationCode(NDC)')
        df.columns = df.columns.str.replace('Unnamed:2', 'SNRangeStart')
        df.columns = df.columns.str.replace('Unnamed:3', 'SNRangeStop')
        break

msrn_table = df[['CountryCode(CC)', 'NationalDestinationCode(NDC)', 'SNRangeStart', 'SNRangeStop']]
print(msrn_table)
Using the same logic, I am trying to retrieve the "Mobile Country Code (MCC)" details, but the pandas DataFrame shows different data from what is in the PDF.
for df in df_list:
    if 'Mobile Country Code (MCC)' in df.columns:
        break

print(df)
The pandas output and the actual content of the PDF were shown as screenshots (not reproduced here); they do not match.
This code works
import pdfplumber
import re

pattern = re.compile(r'Mobile Network Code \(MNC\)[\r\n]+([^\r\n]+)')
# pattern = re.compile(r'Mobile\sNetwork\sCode\s\(MNC\)')

pdf = pdfplumber.open(pdf_path)
n = len(pdf.pages)
final = ""
for page in range(n):
    data = pdf.pages[page].extract_text()
    final = final + "\n" + data

mcc_mnc = " "
matches = pattern.findall(final)
mcc_mnc = mcc_mnc.join(matches)
mcc = mcc_mnc.split(" ")
actual_mcc = mcc[0]
actual_mnc = mcc[1]
print(actual_mcc)
print(actual_mnc)
I am trying to access data by web scraping and make it into a DataFrame using pandas. With the following code, I am already able to get the data frames. I want to combine all of them with append into one large data frame.
import requests
import re
import pandas as pd
from urllib.parse import unquote
from json import loads
from bs4 import BeautifulSoup

# Download URL
url = "https://riwayat-file-covid-19-dki-jakarta-jakartagis.hub.arcgis.com/"
req = requests.get(url)

# Get encoded JSON from HTML source
encoded_data = re.search("window\.__SITE=\"(.*)\"", req.text).groups()[0]

# Decode and load as dictionary
json_data = loads(unquote(encoded_data))

# Get the HTML source code for the links
html_src = json_data["site"]["data"]["values"]["layout"]["sections"][1]["rows"][0]["cards"][0]["component"]["settings"]["markdown"]

# Parse it using BeautifulSoup
soup = BeautifulSoup(html_src, 'html.parser')

# Get links
links = soup.find_all('a')

# For each link...
link_list = []
id_list = []
date_list = []
dataframe_csv = []

for link in links:
    if "2021" in link.text:
        link_list.append(link.text + " - " + link.attrs['href'])

link_list.remove("31 Januari 2021 Pukul 10.00 - https://drive.google.com/file/d/1vd1tToQbx3A420KMDA63aKviLjgGPJMd/view?usp=sharing")

for i in link_list:
    id_list.append(i.split("/")[5])
    date_list.append(i.split("/")[0][:-21])

for ID in id_list:
    dataframe_csv.append("https://docs.google.com/spreadsheets/d/" + ID + "/export?format=csv")
I want to combine all the data frames by using a loop. On each iteration, I want to remove the row at index 0 and add a new column, Date. The code is as follows:
date_num = 0
df_total = pd.DataFrame()

for i in dataframe_csv:
    df = pd.read_csv(i)
    df = df.drop(index=df.index[0], axis=0, inplace=True)
    df = df.assign(Date = date_list[date_num])

    date_num += 1
    df_total.append(df, ignore_index=True)
The problem is, I get an error like this:
AttributeError Traceback (most recent call last)
<ipython-input-11-ef67f0a87a8e> in <module>
5 df = pd.read_csv(i)
6 df = df.drop(index=df.index[0], axis=0, inplace=True)
----> 7 df = df.assign(Date = date_list[date_num])
8
9 date_num += 1
AttributeError: 'NoneType' object has no attribute 'assign'
inplace=True modifies the dataframe directly, so either remove it:
date_num = 0
df_total = pd.DataFrame()

for i in dataframe_csv:
    df = pd.read_csv(i)
    df = df.drop(index=df.index[0], axis=0)
    df = df.assign(Date = date_list[date_num])

    date_num += 1
    df_total.append(df, ignore_index=True)
Or don't assign it back:
date_num = 0
df_total = pd.DataFrame()

for i in dataframe_csv:
    df = pd.read_csv(i)
    df.drop(index=df.index[0], axis=0, inplace=True)
    df = df.assign(Date = date_list[date_num])

    date_num += 1
    df_total.append(df, ignore_index=True)
As mentioned in the documentation of drop:
inplace : bool, default False
If False, return a copy. Otherwise, do operation inplace and return None.
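As a side note beyond the original error: df_total.append(df, ignore_index=True) also returns a new DataFrame rather than modifying df_total in place, so df_total will stay empty unless you assign the result back. Since DataFrame.append is deprecated in newer pandas, a common pattern is to collect the frames in a list and concatenate once at the end; a rough sketch:

# Sketch: collect per-file frames in a list and concat once at the end.
frames = []
for date_num, i in enumerate(dataframe_csv):
    df = pd.read_csv(i)
    df = df.drop(index=df.index[0], axis=0)    # drop the first row, keep the copy
    df = df.assign(Date=date_list[date_num])   # add the Date column
    frames.append(df)

df_total = pd.concat(frames, ignore_index=True)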
How can I export a DataFrame to Excel without overwriting the previous data?
For example:
I'm doing web scraping on a table with pagination, so I take page 1, save it in a DataFrame, and export it to Excel, then do the same for page 2. But every time I save, the previous records are erased and only the last page remains.
Sorry for my English; here is my code:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

i = 1
url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1"

driver = webdriver.Firefox(executable_path=r'C:/Users/Fabio\Desktop/robo/geckodriver.exe')
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table/thead/tr/th[9]").click()

contador = 1

# loop over pagination
while contador < 4:
    # 1. Find the table
    elemento = driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]")
    html_content = elemento.get_attribute('outerHTML')

    # 2. Parse HTML - BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # 3. Data Frame - Pandas
    df_full = pd.read_html(str(table))[0]
    df = df_full[['PLAYER', 'TEAM', 'PTS']]
    df.columns = ['jogador', 'time', 'pontuacao']
    dados1 = pd.DataFrame(df)

    driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[1]/div/div/a[2]").click()
    contador = contador + 1

# 4. Export to excel
dados = pd.DataFrame(df)
dados.to_excel("fabinho.xlsx")
driver.quit()
You are re-assigning df to whatever data you retrieved every time you go through the loop. A solution is to append each page's DataFrame to a list and then pd.concat the list at the end.
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

i = 1
url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1"

driver = webdriver.Firefox(executable_path=r'C:/Users/Fabio\Desktop/robo/geckodriver.exe')
driver.get(url)
time.sleep(5)
driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table/thead/tr/th[9]").click()

contador = 1
df_list = list()

# loop over pagination
while contador < 4:
    # 1. Find the table
    elemento = driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]")
    html_content = elemento.get_attribute('outerHTML')

    # 2. Parse HTML - BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # 3. Data Frame - Pandas
    df_full = pd.read_html(str(table))[0]
    df = df_full[['PLAYER', 'TEAM', 'PTS']]
    df.columns = ['jogador', 'time', 'pontuacao']
    df_list.append(df)

    driver.find_element_by_xpath("/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[1]/div/div/a[2]").click()
    contador = contador + 1

# 4. Export to excel
dados = pd.concat(df_list)
dados.to_excel("fabinho.xlsx")
driver.quit()
I used Python 3 and pandas to parse the daily close from WSJ into Excel. However, the daily close shown on the web page cannot be extracted. Here is the link: https://quotes.wsj.com/index/COMP/historical-prices
How can I download the close data shown on screen into Excel?
And how can I download the file behind the "DOWNLOAD A SPREADSHEET" button into Excel under another name, like comp.xlsx?
Here is the code:
import requests
import pandas as pd

url = 'https://quotes.wsj.com/index/COMP/historical-prices'
jsonData = requests.get(url).json()

final_df = pd.DataFrame()
for row in jsonData['data']:
    # row = jsonData['data'][1]
    data_row = []
    for idx, colspan in enumerate(row['colspan']):
        colspan_int = int(colspan[0])
        data_row.append(row['td'][idx] * colspan_int)
    flat_list = [item for sublist in data_row for item in sublist]
    temp_row = pd.DataFrame([flat_list])
    final_df = final_df.append(temp_row, sort=True).reset_index(drop=True)

wait2 = input("PRESS ENTER TO CONTINUE.")
Follow-up question code:
#
url = 'https://quotes.wsj.com/index/HK/XHKG/HSI/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('HSI.csv', 'wb').write(response.content)
read_file = pd.read_csv(r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.csv')
read_file.to_excel(r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.xlsx', index=None, header=True)
#
url = 'https://quotes.wsj.com/index/SPX/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('SPX.csv', 'wb').write(response.content)
read_file = pd.read_csv(r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.csv')
read_file.to_excel(r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.xlsx', index=None, header=True)
#
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('COMP.csv', 'wb').write(response.content)
read_file = pd.read_csv(r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.csv')
read_file.to_excel(r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.xlsx', index=None, header=True)
The URL is wrong; once you have downloaded the file you can do "Get Info" on a Mac and look at "Where From:", and you will see the URL is of the form below.
import requests
import pandas as pd
import io
#original URL had a bunch of other parameters I omitted, only these seem to matter but YMMV
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
response = requests.get(url)
#do this if you want the CSV written to your machine
open('test_file.csv', 'wb').write(response.content)
# this decodes the content of the downloaded response and presents it to pandas
df_test = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
To answer your additional question -- you can simply loop across a list of tickers or symbols, something like:
base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP', 'SPX', 'HK/XHKG/HSI']

for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name=ticker))
    # do this if you want the CSV written to your machine
    open('prices_' + ticker.replace('/', '-') + '.csv', 'wb').write(response.content)
Note for HK/XHKG/HSI, we need to replace the slashes with hyphens or it's not a valid filename. You can also use this pattern to make dataframes.
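For example, building on the same loop, a small sketch that also loads each download into a DataFrame and writes it to an Excel file named after the ticker (the filename pattern is just illustrative):

import io
import requests
import pandas as pd

base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP', 'SPX', 'HK/XHKG/HSI']

frames = {}
for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name=ticker))
    # decode the CSV response and hand it to pandas
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    frames[ticker] = df
    # write each index to its own Excel file, e.g. comp.xlsx, spx.xlsx, hk-xhkg-hsi.xlsx
    df.to_excel(ticker.replace('/', '-').lower() + '.xlsx', index=False)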