I wrote this code to read the datasets:
import pandas as pd
import pathlib

def load_coordinates(structure, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"coordinates_{structure}.csv"
    df = pd.read_csv(filepath, sep=";")
    if segments:
        return df.loc[df.segment.isin(segments)].reset_index(drop=True)
    else:
        return df
def load_population(structure, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"population_{structure}.csv"
    df = pd.read_csv(filepath, sep=";")
    if segments:
        return df.loc[df.segment.isin(segments)].reset_index(drop=True)
    else:
        return df
def load_ambul_praxen(structure, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"ambul_praxen_{structure}.csv"
    df = pd.read_csv(filepath, sep=";").drop(columns=["planet"])
    if segments:
        return df.loc[df.segment.isin(segments)].reset_index(drop=True)
    else:
        return df
def load_docs_matrix(structure, docs_selection, sum_up_vo):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"docs_matrix_{structure}.csv"
    df = pd.read_csv(filepath, sep=";")
    selected_cols = ["segment"] + docs_selection
    df = df.loc[:, selected_cols]
    if sum_up_vo:
        df["vo"] = df[docs_selection].sum(axis=1)
        df = df.drop(columns=docs_selection)
    return df
def load_weights(structure, request, segments=None, docs_selection=None, sum_up_vo=None):
    if request == "population":
        return load_population(structure)
    elif request == "ambul_praxen":
        return load_ambul_praxen(structure)
    else:
        return load_docs_matrix(structure)
def load_data(structure, request, segments=None, docs_selection=None,
              sum_up_vo=None):
    coordinates = load_coordinates(structure)
    weights = load_weights(structure, request, segments, docs_selection, sum_up_vo)
    return coordinates.merge(weights, on="segment", how="inner")
Before calling the function, I want to add this to the load_docs_matrix function:

if segments:
    return df.loc[df.segment.isin(segments)].reset_index(drop=True)
else:
    return df
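Something like this, I think (just a sketch; note the filter assigns back to df rather than returning, so the column selection below it still runs):

def load_docs_matrix(structure, docs_selection, sum_up_vo, segments=None):
    filedir = pathlib.Path(__file__).parent.parent / "data" / structure
    filepath = filedir / f"docs_matrix_{structure}.csv"
    df = pd.read_csv(filepath, sep=";")
    # optional segment filter, same idea as in the other loaders
    if segments:
        df = df.loc[df.segment.isin(segments)].reset_index(drop=True)
    selected_cols = ["segment"] + docs_selection
    df = df.loc[:, selected_cols]
    if sum_up_vo:
        df["vo"] = df[docs_selection].sum(axis=1)
        df = df.drop(columns=docs_selection)
    return df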
I called the function like this:
request = "ambul_praxen"
structure = "1868"
load_data(structure, request,
          load_coordinates,
          load_ambul_praxen,
          load_population,
          load_weights)
What I want to achieve is to put all the datasets into one dataframe with the load_data function. After calling the function, I got an error saying that I gave 7 arguments but 5 were expected. I tried to adjust the arguments, but that doesn't seem to work.
Any idea how I could solve this?
I'm not sure I understood the question, but if you just want to call these dataframe-reading functions:

def load_data(input_df, structure, request):
    coordinates = load_coordinates(structure)
    population = load_population(structure)
    ambul_praxen = load_ambul_praxen(structure)
    docs_matrix = load_docs_matrix(structure)
    weights = load_weights(structure, request)
    # Maybe concatenate them horizontally?
    return pd.concat([coordinates, population, ambul_praxen, docs_matrix, weights], axis=1)
Or just return them one by one...
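As a usage note on the original error: with the original signatures the loader functions are already called inside load_data, so they should not be passed in as arguments. Calling it with just the structure and the request (the remaining parameters have defaults) should avoid the "7 arguments but 5 expected" error. Roughly:

structure = "1868"
request = "ambul_praxen"
combined = load_data(structure, request)  # coordinates + ambul_praxen, merged on "segment"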
Related
I have a class that reads a dataframe and another class that processes that dataframe. The functions in the processing class should be applied to the same dataframe step by step to shape the final dataframe, which is then saved as a CSV file.
from pydantic import BaseModel
from config import DATA_REPO
import pandas as pd
import os

class PandaDataFrame(BaseModel):
    data: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True

class Directory(BaseModel):
    data_directory: str

class DataToPandaReader(object):
    def csv_file_reader(self, directory: Directory):
        directory = directory.data_directory
        for file in os.listdir(directory):
            if file.endswith('.csv'):
                return pd.read_csv(os.path.join(directory, file))

class DataProcessor(object):
    def remove_punctuation(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        for col in my_data_to_process:
            if any(word in col for word in ['example', 'text', 'Answer']):
                my_data_to_process = my_data_to_process[col].str.replace('[^\w\s]', '', regex=True)
        return add_number_column(my_data_to_process)

    def add_number_column(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        my_data_to_process['sentence_number'] = range(len(my_data_to_process))
        return save_final_dataframe(my_data_to_process)

    def save_final_dataframe(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        return my_data_to_process.to_csv('final_data.csv')

def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess

if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data=DataToPandaReader().csv_file_reader(Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
Now, for example, to call the first function in the DataProcessor class, I would do the following:
DataProcessor().remove_punctuation(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
but my intention is to run all these functions in the DataProcessor class step by step, so the save_final_dataframe function would save a dataframe that has had its punctuation removed and also has a number column.
Update:
Following the answer given, I made these changes, but I get the error that the functions are not known.
def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess

if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data=DataToPandaReader().csv_file_reader(Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
Unless I've misunderstood your use-case, all you need to do is replace
return my_data_to_process
...in the remove_punctuation function with
return add_number_column(my_data_to_process)
...then replace
return my_data_to_process
...in the add_number_column function with
return save_final_dataframe(my_data_to_process)
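As for the update's "functions are not known" error: inside a class, the other methods are only visible through self. A rough sketch of how the chained version could look (wrapping the intermediate frame back into a PandaDataFrame is an assumption on my part, not code from the question):

class DataProcessor(object):
    def remove_punctuation(self, my_: PandaDataFrame):
        df = my_.data
        for col in df.columns:
            if any(word in col for word in ['example', 'text', 'Answer']):
                df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
        # hand the cleaned frame to the next step via self
        return self.add_number_column(PandaDataFrame(data=df))

    def add_number_column(self, my_: PandaDataFrame):
        df = my_.data
        df['sentence_number'] = range(len(df))
        return self.save_final_dataframe(PandaDataFrame(data=df))

    def save_final_dataframe(self, my_: PandaDataFrame):
        return my_.data.to_csv('final_data.csv')

With that in place, parse_data_process only needs to call the first step (remove_punctuation); the other two run as part of the chain.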
My goal is to use a list of tickers together with the TWS API to extract parts of the company snapshot (reqFundamentalData() -> "ReportSnapshot") and the financial statements (reqFundamentalData() -> "ReportsFinStatements") of these tickers, convert into a dataframe and store it as a parquet file.
I tried to merge solutions provided:
use a list of tickers
TWS API to download stock fundamental data runs only the first data entry and ignores the others. Whow to solve this?
store XML as dataframe
Converting XML to Pandas
store data
Save data from TWS API to csv file
Code:
from datetime import datetime
from bs4 import BeautifulSoup as bs
import pandas as pd
from ibapi.client import EClient
from ibapi.contract import Contract
from ibapi.wrapper import EWrapper
import logging
import random
import pathlib
import time
from datetime import date
import datetime
from pathlib import Path
class TestApp(EWrapper, EClient):
    def __init__(self, addr, port, client_id):
        EWrapper.__init__(self)  # new - book
        EClient.__init__(self, self)
        self.firstReqId = 8001
        self.contracts = {}  # keep in dict so you can lookup
        self.contNumber = self.firstReqId
        # add dataframes to store the result
        self.df_company_info = pd.DataFrame(data=None, index=None, columns=None)
        self.df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)

    def addContracts(self, cont):
        self.contracts[self.contNumber] = cont  # add to dict using 8001 first time
        self.contNumber += 1  # next id will be 8002 etc.

    def nextValidId(self, orderId: int):
        # now you are connected, ask for data, no need for sleeps
        # this isn't the only way to know the api is started but it's what IB recommends
        self.contNumber = self.firstReqId  # start with first reqId
        self.getNextData()

    def error(self, reqId, errorCode, errorString):
        print("Error: ", reqId, "", errorCode, "", errorString)
        # if there was an error in one of your requests, just continue with next id
        if reqId > 0 and self.contracts.get(self.contNumber):
            # err in reqFundamentalData based on reqId existing in map
            print('err in', self.contracts[reqId].symbol)
            self.getNextData()  # try next one

    def fundamentalData(self, reqId, fundamental_data):
        self.fundamentalData = fundamental_data
        try:
            if self.fundamentalData is not None:
                # convert XML to dictionary entry
                dict_company_info = self.CompanyInfoXMLtoDict(self.fundamentalData)
                # add dict entry to dataframe
                df_add_row = pd.DataFrame([dict_company_info])
                self.df_company_info = self.df_company_info.append(df_add_row, ignore_index=True)
        except KeyError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        except TypeError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        except ValueError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        except IndexError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        self.getNextData()

    def getNextData(self):
        if self.contracts.get(self.contNumber):  # means a contract exists
            # so req data
            self.reqFundamentalData(self.contNumber, self.contracts[self.contNumber], "ReportSnapshot", [])
            self.contNumber += 1  # now get ready for next request
        else:  # means no more sequentially numbered contracts
            print('done')
            self.disconnect()  # just exit
    def CompanyInfoXMLtoDict(self, fundamentals):
        soup = bs(fundamentals, 'xml')
        df_company_info = pd.DataFrame(data=None, index=None, columns=None)
        ticker = ''
        longName = ''
        fullTimeEmployees = 0
        # search for a tag e.g. </IssueID>
        for issues in soup.find_all('IssueID'):
            # within this tag -> search for a unique ID e.g. IssueID type=...
            if issues.get('Type') == "Ticker":
                ticker = issues.get_text()
                break
        for coID_i in soup.find_all('CoID'):
            if coID_i.get('Type') == "CompanyName":
                longName = coID_i.get_text()
                break
        for employees_i in soup.find_all('Employees'):
            fullTimeEmployees = employees_i.get_text()
            break
        # create result entry row
        if ticker is not None and ticker != '':
            new_row_dict = {'ticker': ticker, 'name': longName,
                            'num_employees': fullTimeEmployees}
        else:
            new_row_dict = {}
        return new_row_dict
    def FinStmtsXMLtoDF(self, fundamentals, ticker, stmts_type):
        today = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        today_date = date.today().strftime("%Y-%m-%d")
        if stmts_type == 'annual':
            period_type = 'Annual'
        else:
            period_type = 'Interim'
        soup = bs(fundamentals, 'xml')
        # build dict
        stmts_terms = {}
        for terms in soup.find_all("mapItem"):
            # add entry to dict -> dict for mapping of code to description
            stmts_terms[terms.get('coaItem')] = terms.get_text()
        bal_l = []
        inc_l = []
        cas_l = []
        for period in soup.find_all('FiscalPeriod'):
            # quarterly vs. annually
            if period.get('Type') == period_type:
                for statement in period.find_all('Statement'):
                    if statement.find('UpdateType').get('Code') != 'CLA':
                        dic = {}
                        stmts_type = statement.get('Type')
                        # source_date = statement.find('Source').get('Date')
                        statement_date = statement.find('StatementDate').text
                        # dic['date'] = source_date
                        dic['rep_date'] = statement_date
                        for item in statement.find_all('lineItem'):
                            # dic[item.get('coaCode')] = item.text
                            dic[stmts_terms.get(item.get('coaCode'), 'DEFAULT')] = item.text
                        if stmts_type == 'BAL':
                            bal_l.append(dic)
                            # print(stmts_type, date, dic)
                        elif stmts_type == 'INC':
                            inc_l.append(dic)
                        elif stmts_type == 'CAS':
                            cas_l.append(dic)
        df_balance_sheet = pd.DataFrame(bal_l).sort_values('rep_date')
        df_income_statement = pd.DataFrame(inc_l).sort_values('rep_date')
        df_cash_flow = pd.DataFrame(cas_l).sort_values('rep_date')
        # merge all stmts for same rep_date
        df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
        df_fin_stmts = df_balance_sheet.merge(df_income_statement, how='left',
                                              left_on=['rep_date'],
                                              right_on=['rep_date'])
        df_fin_stmts = df_fin_stmts.merge(df_cash_flow, how='left',
                                          left_on=['rep_date'],
                                          right_on=['rep_date'])
        df_fin_stmts.insert(loc=0, column='ticker', value=ticker)
        df_fin_stmts.insert(loc=1, column='date_updated', value=today_date)
        return df_fin_stmts
def main():
    # ----- config
    project_data_folder = '/home/data/'
    project_data_folder = Path(project_data_folder)
    # ticker are stored in a csv file
    csv_master_ticker = Path('home/data/ticker/ticker-list.csv')
    # load list of tickers
    df = pd.read_csv(csv_master_ticker)
    list_master_ticker = df['ticker'].tolist()
    fusion_company_info = pd.DataFrame(data=None, index=None, columns=None)
    fusion_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
    fusion_q_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
    client = TestApp('127.0.0.1', 7496, 0)
    for ticker in list_master_ticker:
        # remove additional postfix for exchange e.g. XYZ.F -> XYZ
        ticker_clean = ticker.rstrip('.')
        contract = Contract()
        contract.symbol = ticker_clean
        contract.secType = 'STK'
        contract.exchange = "SMART"
        contract.currency = 'USD'
        client.addContracts(contract)
    client.connect('127.0.0.1', 7496, 0)
    client.run()
    if fusion_company_info.empty:
        fusion_company_info = client.df_company_info
    else:
        fusion_company_info = pd.concat([fusion_company_info, client.df_company_info])
    tws_company_info_file_name = 'tws_company_info.parquet'
    file_name = project_data_folder / tws_company_info_file_name
    try:
        if fusion_company_info is not None:
            if not fusion_company_info.empty:
                fusion_company_info.to_parquet(file_name, engine='pyarrow')
    # financial statements - annual
    tws_fin_stmts_file_name = 'tws_fin_stmts.parquet'
    file_name = project_data_folder / tws_fin_stmts_file_name
    try:
        if fusion_fin_stmts is not None:
            if not fusion_fin_stmts.empty:
                fusion_fin_stmts.to_parquet(file_name, engine='pyarrow')
I get an error message:
Traceback (most recent call last):
File "...\ibapi\client.py", line 239, in run
self.decoder.interpret(fields)
File "...\ibapi\decoder.py", line 1278, in interpret
self.interpretWithSignature(fields, handleInfo)
File "...\ibapi\decoder.py", line 1259, in interpretWithSignature
method(*args)
TypeError: 'str' object is not callable
python-BaseException
Can someone help me with this error message?
If I remove the for loop and run it only for a single ticker e.g.
client.contracts = {}
contract = Contract()
contract.symbol = 'AMD'
contract.secType = 'STK'
contract.currency = 'USD'
contract.exchange = "SMART"
client.addContracts(contract)
client.connect('127.0.0.1', 7496, 0)
client.run()
I don't get an error message, and the dataframe self.df_company_info gets populated with the correct data for AMD.
General questions:
Is it possible to get via reqFundamentalData() not only the company info "ReportSnapshot", but also the financial statements "ReportsFinStatements" (df_fin_stmts and the function "FinStmtsXMLtoDF") in one request/run?
I'm new to Python and would expect functions to run only when they are explicitly called in the code, but with the TWS API (socket, reqId) it seems to work differently, and it's not fully clear to me when which function is called after another.
E.g. how do I know that by executing reqFundamentalData() the function fundamentalData() is called? Or nextValidId() is somehow triggered but never explicitly called within the program. Is there a good tutorial that introduces which functions are called in which order?
Thank you very much
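For what it's worth, a stripped-down sketch of the request/callback pairing (MiniApp and the hard-coded AMD contract are placeholders, not code from the post above): reqFundamentalData() only sends the request over the socket; EClient.run() is the message loop that reads the reply and invokes the overridden EWrapper method (here fundamentalData()), and nextValidId() is likewise invoked by the API itself once the connection is ready.

from ibapi.client import EClient
from ibapi.contract import Contract
from ibapi.wrapper import EWrapper

class MiniApp(EWrapper, EClient):
    def __init__(self):
        EWrapper.__init__(self)
        EClient.__init__(self, self)

    def nextValidId(self, orderId):
        # called by the API once the connection is ready -> safe to start sending requests
        contract = Contract()
        contract.symbol = "AMD"
        contract.secType = "STK"
        contract.exchange = "SMART"
        contract.currency = "USD"
        self.reqFundamentalData(1, contract, "ReportSnapshot", [])

    def fundamentalData(self, reqId, data):
        # called by the message loop when the reply for reqId arrives
        print(reqId, len(data))
        self.disconnect()

app = MiniApp()
app.connect("127.0.0.1", 7496, clientId=0)
app.run()  # message loop: reads the socket and dispatches replies to the callbacks above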
I've written a simple script to save out the names of various subfolders into a spreadsheet. It seems to be doing its job at every point up to the return statement. It returns None...
If I add a print statement before the return I can see a populated dataFrame.
I guess I'm missing something obvious, would appreciate some help!
Thanks
import sys, os, glob
from glob import glob
import pandas as pd

def findSubFoldersMultiple(iter, data_container):
    if iter > 0:
        current_directory = sys.argv[iter]
        directory_reformatted = sys.argv[iter] + "/*/"
        folders = glob(directory_reformatted)
        folders_stripped = [folder.replace(sys.argv[iter], '').replace('/', '') for folder in folders]
        curr_data_container = pd.DataFrame({current_directory: folders_stripped})
        combined_data_container = pd.concat([data_container, curr_data_container], axis=1)
        findSubFoldersMultiple(iter - 1, combined_data_container)
    else:
        print('Populated container in loop: \n')
        print(data_container)
        return data_container

if len(sys.argv) < 2:
    print("Please specify directory/directories.")
else:
    writer = pd.ExcelWriter('subfolders.xlsx')
    empty_frame = pd.DataFrame({})
    populated_DF = findSubFoldersMultiple(len(sys.argv) - 1, empty_frame)
    print('Returned container: \n')
    print(populated_DF)
Catch the return value by changing the last line in the if block to:
return findSubFoldersMultiple(iter-1,combined_data_container)
Otherwise you're returning the value on the base case (the else block), but not returning it further up the chain of non-base case recursive calls.
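For illustration, a minimal sketch of the function with just that one change (otherwise identical to the code in the question):

import sys
import pandas as pd
from glob import glob

def findSubFoldersMultiple(iter, data_container):
    if iter > 0:
        current_directory = sys.argv[iter]
        folders = glob(sys.argv[iter] + "/*/")
        folders_stripped = [folder.replace(sys.argv[iter], '').replace('/', '') for folder in folders]
        curr_data_container = pd.DataFrame({current_directory: folders_stripped})
        combined_data_container = pd.concat([data_container, curr_data_container], axis=1)
        # return the recursive call so the populated frame propagates back up
        return findSubFoldersMultiple(iter - 1, combined_data_container)
    else:
        return data_container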
This question already has an answer here:
Python: One Try Multiple Except
(1 answer)
Closed 4 years ago.
Essentially I am able to do what I want with my current code, which is to upload a CSV file, manipulate it with pandas and then update a MSQL database. I would like to add error handling somehow. The upload function will only work for one particular file and throws different errors for all others.
Is there a way that I can catch multiple errors and return an error message to the user?
Possibly something like a check on the input CSV file's column headers.
@app.route('/upload', methods=['GET', 'POST'])
def csv_input():
    tempfile_path = tempfile.NamedTemporaryFile().name
    # file.save(tempfile_path)
    # sheet = pd.read_csv(tempfile_path)
    if request.method == 'POST':
        file = request.files['file']
        if file:  # and allowed_filename(file.filename):
            # filename = secure_filename(file.filename)
            file.save(tempfile_path)
            input_csv = pd.read_csv(tempfile_path, sep=",", engine='python')
            #### Data Cleansing From Uploaded Data
            col_titles = ['id', 'title', 'vote_average', 'w_average', 'vote_count', 'year', 'runtime',
                          'budget', 'revenue', 'profit']
            # Only keep data where the original language is English
            input_csv = input_csv[input_csv['original_language'] == 'en']
            # New dataframe that only contains data with vote count > 10
            input_csv = input_csv[input_csv['vote_count'] >= 10]
            # Fill all NA values with 0 - needed to set datatypes
            input_csv = input_csv.fillna(0)
            # Remove all rows with no runtime
            input_csv = input_csv[input_csv['runtime'] != 0]
            # Remove all duplicate rows
            input_csv = input_csv.drop_duplicates()
            input_csv['vote_average'] = input_csv.vote_average.astype(float).round(1)
            input_csv.vote_average.round(1)
            input_csv['runtime'] = input_csv.runtime.astype(int)
            input_csv['vote_count'] = input_csv.vote_count.astype(int)
            input_csv['revenue'] = input_csv.revenue.astype('int64')
            input_csv['budget'] = input_csv.budget.astype('int64')
            profit_cal(input_csv, 'revenue', 'budget', 'profit')
            input_csv['profit'] = input_csv.profit.astype('int64')
            input_csv['profit'] = input_csv.profit.replace(0, 'No Data')
            # reorder_data = pd.DataFrame(input_csv)
            # Year cleaning
            input_csv['year'] = pd.to_datetime(input_csv['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
            # C = reorder_data['vote_average'].mean()
            # m = reorder_data['vote_count'].quantile(0.10)
            # w_average = org_data.copy().loc[reorder_data['vote_count'] >= m]
            #### IMDB Data Calculation
            V = input_csv['vote_count']
            R = input_csv['vote_average']
            C = input_csv['vote_average'].mean()
            m = input_csv['vote_count'].quantile(0.10)
            input_csv['w_average'] = (V / (V + m) * R) + (m / (m + V) * C)
            # C = input_csv['vote_average'].mean()
            # m = input_csv['vote_count'].quantile(0.10)
            # input_csv['w_average'] = input_csv.apply(weighted_rating, axis=1)
            input_csv['w_average'] = input_csv.w_average.astype(float).round(1)
            reorder_data = input_csv[col_titles]
            reorder_data.to_sql(name='title_data', con=engine, if_exists='replace', index=False)
            # Reorder the data and output in the correct order
            ##### Genre Loads == DataFrame 2
            df = input_csv
            v = df.genres.apply(json.loads)
            df = pd.DataFrame(
                {
                    'id': df['id'].values.repeat(v.str.len(), axis=0),
                    'genre': np.concatenate(v.tolist())
                })
            df['genre'] = df['genre'].map(lambda x: x.get('name'))
            genre_data = df.genre.str.get_dummies().sum(level=0)
            genre_data = df.loc[(df != 0).any(1)]
            # genre_data = genre_data.set_index('id')
            genre_order = ['id', 'genre']
            ## Dataframe to SQL
            genre_data[genre_order].to_sql(name='genre_data', con=engine, if_exists='replace', index=False)
            ####### Keyword Search ### Dataframe
            # genre_data.to_csv("genre_data.csv")
            # return genre_data[genre_order].to_html()
            flash('Database has been updated successfully', 'success')
            # return reorder_data[col_titles].to_html()
            # stream = io.StringIO(file.stream.read().decode("UTF8"), newline=None)
            # csv_input = csv.reader(stream)
            # return reorder_data.to_html(index=False)
            # flash('File Uploaded Successfully')
            # return redirect(url_for('index'))
    return render_template('upload.html')
There are several methods:
The Python way, just add try: and except with the relevant exception classes.
from pandas.errors import EmptyDataError, ParserError

try:
    # parsing & processing logic here
    pass
except EmptyDataError as ex:
    # tell the user we don't accept empty data
    pass
except ParserError as ex:
    # tell the user we failed to parse their input
    pass
except Exception as ex:
    # tell the user that something went wrong
    pass
The Flask way: register error handlers with Flask for specific exceptions (this affects the whole Flask application):
@app.errorhandler(pandas.errors.EmptyDataError)
def handle_empty_data(error):
    return 'Failed parsing Input', 200
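For the header check mentioned in the question, a small sketch (REQUIRED_COLUMNS is a guess based on the columns the cleansing code touches, and flash/render_template are assumed to be imported as in the question's app): run it right after read_csv and report the problem instead of letting the processing raise later.

REQUIRED_COLUMNS = {'id', 'title', 'original_language', 'vote_average', 'vote_count',
                    'runtime', 'budget', 'revenue', 'release_date', 'genres'}

def validate_headers(df):
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        # tell the user which expected columns the upload is missing
        flash('Upload failed, missing columns: {}'.format(', '.join(sorted(missing))), 'danger')
        return False
    return True

# inside csv_input(), right after reading the upload:
# input_csv = pd.read_csv(tempfile_path, sep=",", engine='python')
# if not validate_headers(input_csv):
#     return render_template('upload.html')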
I am trying to incrementally load values from the first column of a CSV file into a URL and request the URLs one at a time with a 5-second delay. Each value in the first column should replace "theid".
This is my code so far:
# I have a defined function
def withid(theid):
    """"""
    global cache
    dupe = False
    theurl = "{0}{1}{2}".format(OMDBURL, "?i=", theid)
    response = urllib2.urlopen(theurl)
    movdata = json.load(response)
    for mov in cache:
        if movdata[MKEY[1]] == mov[MKEY[1]]:
            dupe = True
    if not dupe:
        cache.append(movdata)

outfile2 = open('outputrows2-shortened.csv', 'rb')
for row in outfile2:
    theid = outfile2(row[0])
    time.sleep(5)
output: TypeError: 'file' object is not callable
Your withid() function never returns anything. Try adding return movdata at the end of it.
Here is a rewritten version which may be helpful:
import csv
import json
import time
import urllib2

PAGE_DELAY = 5.   # time between loading pages
PAGE_LOAD = 0.3   # how long it takes to load a page

make_url = 'http://www.imdb.com/title/tt{}/'.format

def get_csv_column(csv_fname, col, **kwargs):
    with open(csv_fname, 'rb') as inf:
        incsv = csv.reader(inf, **kwargs)
        column = [row[col] for row in incsv]
    return column

def get_data_by_id(id):
    url = make_url(id)
    response = urllib2.urlopen(url)
    data = json.load(response)
    return id, data

def delayed(delay, fn, *args):
    time.sleep(delay)
    return fn(*args)

def human_time(seconds):
    if seconds >= 86400:
        return '{:0.1f} days'.format(seconds / 86400.)
    elif seconds >= 3600:
        return '{:0.1f} hours'.format(seconds / 3600.)
    elif seconds >= 60:
        return '{:0.1f} minutes'.format(seconds / 60.)
    else:
        return '{:0.1f} seconds'.format(seconds)

def main():
    ids = get_csv_column('outputrows2.csv', 0)
    expected = (PAGE_DELAY + PAGE_LOAD) * len(ids)
    print('This will take about {}.'.format(human_time(expected)))
    results = (delayed(PAGE_DELAY, get_data_by_id, id) for id in ids)
    moviedata = dict(results)  # => gives dict of {id: data}

if __name__ == "__main__":
    main()