How do I import various DataFrames from a different Python file? - python

I have a Python file called 'clean_data.py' which has all the DataFrames I need, and I want to import them into another Python file called 'main.py' that builds a dashboard.
Is it possible to create a class in my clean_data.py, and if so can someone direct me to an article (which I have struggled to find so far) so that I can figure it out?
The aim is to shift from CSV to an API over time, so I wanted to keep the data-wrangling side of things in a separate file while the web app components live in main.py.
Any help would be much appreciated.
The code from clean_data.py is:
import pandas as pd
import csv
import os  # To access my file directory
print(os.getcwd())  # Prints the current working directory
fdi_data = pd.read_csv(r'Data/fdi_data.csv')
fdi_meta = pd.read_csv(r'Data/fdi_metadata.csv')
debt_data = pd.read_csv(r'Data/debt_data.csv')
debt_meta = pd.read_csv(r'Data/debt_metadata.csv')
gdp_percap_data = pd.read_csv(r'Data/gdp_percap_data.csv', header=2)
gdp_percap_meta = pd.read_csv(r'Data/gdp_percap_metadata.csv')
gov_exp_data = pd.read_csv(r'Data/gov_exp_data.csv', header=2)
gov_exp_meta = pd.read_csv(r'Data/gov_exp_metadata.csv')
pop_data = pd.read_csv(r'Data/pop_data.csv', header=2)
pop_meta = pd.read_csv(r'Data/pop_metadata.csv')
"""
'wb' stands for World Bank
"""
def wb_merge_data(data, metadata):
    merge = pd.merge(
        data,
        metadata,
        on='Country Code',
        how='inner'
    )
    return merge
fdi_merge = wb_merge_data(fdi_data, fdi_meta)
debt_merge = wb_merge_data(debt_data, debt_meta)
gdp_percap_merge = wb_merge_data(gdp_percap_data, gdp_percap_meta)
gov_exp_merge = wb_merge_data(gov_exp_data, gov_exp_meta)
pop_merge = wb_merge_data(pop_data, pop_meta)
def wb_drop_data(data):
    drop = data.drop(['Country Code', 'Indicator Name', 'Indicator Code', 'TableName', 'SpecialNotes', 'Unnamed: 5'], axis=1)
    return drop
fdi_merge = wb_drop_data(fdi_merge)
debt_merge = wb_drop_data(debt_merge)
gdp_percap_merge = wb_drop_data(gdp_percap_merge)
gov_exp_merge = wb_drop_data(gov_exp_merge)
pop_merge = wb_drop_data(pop_merge)
def wb_mr_data(data, value_name):
    data = data.melt(['Country Name', 'Region', 'IncomeGroup']).reset_index()
    data = data.rename(columns={'variable': 'Year', 'value': value_name})
    data = data.drop('index', axis=1)
    return data
fdi_merge = wb_mr_data(fdi_merge, 'FDI')
debt_merge = wb_mr_data(debt_merge, 'Debt')
gdp_percap_merge = wb_mr_data(gdp_percap_merge, 'GDP per Cap')
gov_exp_merge = wb_mr_data(gov_exp_merge, 'Gov Expend.')
pop_merge = wb_mr_data(pop_merge, 'Population')
def avg_groupby(data, col_cal, cn=False, ig=False, rg=False):
    if cn:
        return data.groupby('Country Name')[col_cal].mean().reset_index()
    elif ig:
        return data.groupby('IncomeGroup')[col_cal].mean().reset_index()
    elif rg:
        return data.groupby('Region')[col_cal].mean().reset_index()
"""
avg_cn_... For country
avg_ig_... Income Group
avg_rg_... Region
"""
avg_cn_fdi = avg_groupby(fdi_merge, 'FDI', cn=True)
avg_ig_fdi = avg_groupby(fdi_merge, 'FDI', ig=True)
avg_rg_fdi = avg_groupby(fdi_merge, 'FDI', rg=True)
avg_cn_debt = avg_groupby(debt_merge, 'Debt', cn=True)
avg_ig_debt = avg_groupby(debt_merge, 'Debt', ig=True)
avg_rg_debt = avg_groupby(debt_merge, 'Debt', rg=True)
avg_cn_gdp_percap = avg_groupby(gdp_percap_merge, 'GDP per Cap', cn=True)
avg_ig_gdp_percap = avg_groupby(gdp_percap_merge, 'GDP per Cap', ig=True)
avg_rg_gdp_percap = avg_groupby(gdp_percap_merge, 'GDP per Cap', rg=True)
avg_cn_gexp = avg_groupby(gov_exp_merge, 'Gov Expend.', cn=True)
avg_ig_gexp = avg_groupby(gov_exp_merge, 'Gov Expend.', ig=True)
avg_rg_gexp = avg_groupby(gov_exp_merge, 'Gov Expend.', rg=True)
avg_cn_pop = avg_groupby(pop_merge, 'Population', cn=True)
avg_ig_pop = avg_groupby(pop_merge, 'Population', ig=True)
avg_rg_pop = avg_groupby(pop_merge, 'Population', rg=True)

In Python, every file is a module, so if you want to re-use your code you can simply import this module. For example:
# main.py
import clean_data
print(clean_data.avg_cn_fdi)
You probably don't need to create a class for this.
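One thing worth adding (my note, not part of the original answer): importing clean_data runs the whole file top to bottom, including the print(os.getcwd()) call and all the CSV reads. Any statements you only want when running the file directly can go behind the standard main guard, roughly like this:
# at the bottom of clean_data.py -- only runs when the file is executed directly,
# not when main.py imports it
if __name__ == "__main__":
    print(os.getcwd())
    print(avg_cn_fdi.head())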

You can import the whole Python file just as you'd import any other locally created module and get access to the DataFrames inside it. Here's an example:
I created a file called temporary.py:
import pandas as pd
data = pd.read_csv("temp.csv")
And then in a separate file I was able to use data like so:
import temporary
print(temporary.data)
Or, you could also do:
from temporary import data
print(data)
All that being said, I don't believe that this would be the best way to handle your data.
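One alternative this answer may be hinting at (a sketch of my own using the names from the question, not a tested implementation) is to wrap the wrangling in a loader function, so nothing heavy runs at import time and the CSV source can later be swapped for the planned API:
# clean_data.py (sketch)
def load_fdi(source="csv"):
    """Return the cleaned FDI frame; 'source' is a placeholder for the planned API switch."""
    if source == "csv":
        fdi_data = pd.read_csv(r'Data/fdi_data.csv')
        fdi_meta = pd.read_csv(r'Data/fdi_metadata.csv')
    else:
        raise NotImplementedError("API source not wired up yet")
    merged = wb_merge_data(fdi_data, fdi_meta)
    return wb_mr_data(wb_drop_data(merged), 'FDI')

# main.py (sketch)
from clean_data import load_fdi
fdi_merge = load_fdi()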

Related

Streamlit: Display Text after text, not all at once

I want to build a data annotation interface. I read in an Excel file where only a text column is relevant, so "df" could also be replaced by a list of texts. This is my code:
import streamlit as st
import pandas as pd
import numpy as np

st.title('Text Annotation')
df = pd.read_excel('mini_annotation.xlsx')
for i in range(len(df)):
    text = df.iloc[i]['Text']
    st.write(f"Text {i} out of {len(df)}")
    st.write("Please classify the following text:")
    st.write("")
    st.write(text)
    text_list = []
    label_list = []
    label = st.selectbox("Classification:", ["HOF", "NOT", "Not Sure"])
    if st.button("Submit"):
        text_list.append(text)
        label_list.append(label)
        df_annotated = pd.DataFrame(columns=['Text', 'Label'])
        df_annotated["Text"] = text_list
        df_annotated["Label"] = label_list
        df_annotated.to_csv("annotated_file.csv", sep=";")
The interface currently displays all of the texts at once. However, I want it to show just one text, e.g. the first text of my dataset. After the user has submitted their choice via the "Submit" button, the first text should disappear and the second text should be displayed. This process should continue until the last text of the dataset is reached.
How do I do this?
(I am aware of the error message; for that I just have to add a key to the selectbox, but I am not sure whether that is needed in the end.)
This task can be solved with the help of session state; you can read about it here: session state. Streamlit reruns the whole script on every interaction, so the current position in the dataset has to be kept in st.session_state to survive those reruns:
import streamlit as st
import pandas as pd
import numpy as np

st.title('Text Annotation')
df = pd.DataFrame(['Text 1', 'Text 2', 'Text 3', 'Text 4'], columns=['Text'])

if st.session_state.get('count') is None:
    st.session_state.count = 0
    st.session_state.label_list = []

with st.form("my_form"):
    st.write("Progress: ", st.session_state.count, "/", len(df))
    if st.session_state.count < len(df):
        st.write(df.iloc[st.session_state.count]['Text'])
        label = st.selectbox("Classification:", ["HOF", "NOT", "Not Sure"])
        submitted = st.form_submit_button("Submit")
        if submitted:
            st.session_state.label_list.append(label)
            st.session_state.count += 1
    else:
        st.write("All done!")
        submitted = st.form_submit_button("Submit")
        if submitted:
            st.write(st.session_state.label_list)

The DataFrameClient class of Python's influxdb package uploads only the last line of the DataFrame

I am trying to use Python's influxdb package to upload a DataFrame into the database.
I am using write_points to write points into the database, as given in the documentation (https://influxdb-python.readthedocs.io/en/latest/api-documentation.html).
Every time I use it, only the last line of the DataFrame is written instead of the complete DataFrame.
Is this the usual behaviour, or is there some problem here?
Given below is my script:
from influxdb import InfluxDBClient, DataFrameClient
import pathlib
import numpy as np
import pandas as pd
import datetime

db_client = DataFrameClient('dbserver', port, 'username', 'password', 'database',
                            ssl=True, verify_ssl=True)
today = datetime.datetime.now().strftime('%Y%m%d')
path = pathlib.Path('/dir1/dir/2').glob(f'pattern_to_match*/{today}.filename.csv')
for file in path:
    order_start = pd.read_csv(f'{file}')
    if not order_start.empty:
        order_start['data_line1'] = (order_start['col1'] - order_start['col2']) * 1000
        order_start['data_line2'] = (order_start['col3'] - order_start['col4']) * 1000
        d1 = round(order_start['data_line1'].quantile(np.arange(0, 1.1, 0.1)), 3)
        d2 = round(order_start['data_line2'].quantile(np.arange(0, 1.1, 0.1)), 3)
        out_file = pd.DataFrame()
        out_file = out_file.append(d1)
        out_file = out_file.append(d2)
        out_file = out_file.T
        out_file.index = out_file.index.set_names(['percentile'])
        out_file = out_file.reset_index()
        out_file['percentile'] = out_file.percentile.apply(lambda x: f'{100*x:.0f}%')
        out_file['tag_col'] = str(file).split('/')[2]
        out_file['time'] = pd.to_datetime('today').strftime('%Y%m%d')
        out_file = out_file.set_index('time')
        out_file.index = pd.to_datetime(out_file.index)
        db_client.write_points(out_file, 'measurement', database='database',
                               retention_policy='rp')
Can anyone please help?
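For reference, a minimal sketch (my own illustration, not from the original thread) of how DataFrameClient.write_points keeps rows apart: InfluxDB identifies a point by measurement, tag set and timestamp, so rows that share all three overwrite each other. In the script above every row of out_file gets the same date-level timestamp and the same tag_col value, which would explain why only the last line survives. Declaring percentile as a tag column (or giving each row its own timestamp) makes the rows distinct:
import pandas as pd
from influxdb import DataFrameClient

# Hypothetical two-row frame; in the real script this would be out_file
df = pd.DataFrame({'percentile': ['0%', '10%'],
                   'data_line1': [1.234, 2.345],
                   'tag_col': ['dir', 'dir']},
                  index=pd.to_datetime(['2023-01-02', '2023-01-02']))

client = DataFrameClient('dbserver', 8086, 'username', 'password', 'database')
# tag_columns makes percentile part of each point's identity, so the two rows
# no longer collide even though they share a timestamp
client.write_points(df, 'measurement',
                    tag_columns=['percentile', 'tag_col'],
                    field_columns=['data_line1'],
                    database='database')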

Resampling Live Websocket Ticks to Candles using Pandas in python

I am trying to resample live ticks from the KiteTicker websocket into OHLC candles using pandas, and this is the code I have written. It works fine with a single instrument (the commented-out trd_portfolio) but doesn't work with multiple instruments (the active trd_portfolio dict), as it mixes up data from different instruments.
Is there any way to relate the final candles df to the instrument tokens, or to make this work with multiple instruments?
I would like to run my algo on multiple instruments at once; please suggest if there is a better way around it.
from kiteconnect import KiteTicker
from kiteconnect import KiteConnect
import logging
import time, os, datetime, math
import winsound
import pandas as pd

trd_portfolio = {954883: "USDINR19MARFUT", 4632577: "JUBLFOOD"}
# trd_portfolio = {954883: "USDINR19MARFUT"}
trd_tkn1 = []
for x in trd_portfolio:
    trd_tkn1.append(x)

c_id = '****************'
ak = '************'
asecret = '*************************'

kite = KiteConnect(api_key=ak)
print('[*] Generate access Token : ', kite.login_url())
request_tkn = input('[*] Enter Your Request Token Here : ')[-32:]
data = kite.generate_session(request_tkn, api_secret=asecret)
kite.set_access_token(data['access_token'])
kws = KiteTicker(ak, data['access_token'])

# columns in data frame
df_cols = ["Timestamp", "Token", "LTP"]
data_frame = pd.DataFrame(data=[], columns=df_cols, index=[])

def on_ticks(ws, ticks):
    global data_frame, df_cols
    data = dict()
    for company_data in ticks:
        token = company_data["instrument_token"]
        ltp = company_data["last_price"]
        timestamp = company_data['timestamp']
        data[timestamp] = [timestamp, token, ltp]
    tick_df = pd.DataFrame(data.values(), columns=df_cols, index=data.keys())
    data_frame = data_frame.append(tick_df)
    ggframe = data_frame.set_index(['Timestamp'], ['Token'])
    print(ggframe)
    gticks = ggframe.loc[:, ['LTP']]
    candles = gticks['LTP'].resample('1min').ohlc().dropna()
    print(candles)

def on_connect(kws, response):
    print('Connected')
    kws.subscribe(trd_tkn1)
    kws.set_mode(kws.MODE_FULL, trd_tkn1)

def on_close(ws, code, reason):
    print('Connection Error')

kws.on_ticks = on_ticks
kws.on_connect = on_connect
kws.on_close = on_close
kws.connect()
I don't have access to the Kite API, but I've been looking at some code snippets that use it trying to figure out another issue I'm having related to websockets. I came across this open question, and I think I can help, though I can't really test this solution.
The problem, I think, is that you're not calculating OHLC for each token separately... it just does it across all tokens at once.
data_frame = data_frame.append(tick_df)
ggframe = data_frame.set_index('Timestamp')
candles = ggframe.groupby('Token').resample('1min').agg({'LTP': 'ohlc'})
You'll get a multi-index output, but the column names might not quite line up for the rest of your code. To fix that:
candles.columns=['open','high','low','close']
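To illustrate the shape of that output, here is a small self-contained example (toy data of my own, not from the question) using the same groupby/resample pattern; note the grouping column is 'Token', matching df_cols above:
import pandas as pd

# Toy ticks for two hypothetical instrument tokens
ticks = pd.DataFrame({
    'Timestamp': pd.to_datetime(['2023-01-02 09:15:01', '2023-01-02 09:15:30',
                                 '2023-01-02 09:15:05', '2023-01-02 09:15:45']),
    'Token': [954883, 954883, 4632577, 4632577],
    'LTP': [70.10, 70.25, 1415.0, 1420.5],
})

ggframe = ticks.set_index('Timestamp')
candles = ggframe.groupby('Token').resample('1min').agg({'LTP': 'ohlc'})
candles.columns = ['open', 'high', 'low', 'close']  # flatten the MultiIndex columns
print(candles)  # one OHLC row per token per minute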

Overlay graphs with figure_factory

I am trying to plot some finance graphs and I am using the Plotly libraries for it.
I am quite new to this, so I had to look for a way to do it, and I wanted to do it offline, so I found this approach:
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like
from pandas_datareader import data as web
import datetime
import fix_yahoo_finance as yf
import plotly
from plotly import figure_factory as ff
import os

yf.pdr_override()

class Stock:
    # We need a constructor to get all the data when we create the class
    def __init__(self, name):  # This needs a string as argument to work
        self.name = name
        # If we already have the stock's data we don't download it again
        if os.path.exists('stock_prices_yahoo/' + name + '.csv'):
            self.df = pd.read_csv('stock_prices_yahoo/' + name + '.csv')
        else:
            start_date = '1950-01-01'
            end_date = datetime.date.today()
            df = web.get_data_yahoo(name, start_date, end_date)
            df.to_csv('stock_prices_yahoo/' + name + '.csv')
            self.df = df.reset_index()  # keep the Date column available, like the CSV path

    def plot_stock(self):  # This will plot the stock prices
        fig = ff.create_candlestick(self.df.Open, self.df.High, self.df.Low, self.df.Close, dates=self.df.Date)
        fig['layout'].update({
            'title': self.name + ' prices',
            'yaxis': {'title': 'Price'},
            'xaxis': {'title': 'Date'},
        })
        plotly.offline.plot(fig, filename=self.name + '-candlestick.html', validate=True)

AAPL = Stock('AAPL')
AAPL.plot_stock()
The point is that I want to overlay some moving averages on it, and after searching around the whole internet, people always seem to do it via matplotlib.
Is there any way to do it with figure_factory?
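One approach (a sketch of my own, not a tested answer from the thread): the figure returned by figure_factory is a regular Plotly figure, so a moving-average line can be added as an ordinary Scatter trace on top of the candlestick traces. The add_trace call assumes a reasonably recent Plotly version (on very old versions you would append to fig['data'] instead), and the window size is arbitrary:
import plotly.graph_objs as go

def plot_stock_with_ma(self, window=50):
    # Same candlestick as plot_stock above
    fig = ff.create_candlestick(self.df.Open, self.df.High, self.df.Low,
                                self.df.Close, dates=self.df.Date)
    # Rolling mean of the closing prices, overlaid as a line
    ma = self.df.Close.rolling(window=window).mean()
    fig.add_trace(go.Scatter(x=self.df.Date, y=ma, mode='lines',
                             name=f'{window}-day MA'))
    plotly.offline.plot(fig, filename=self.name + '-candlestick-ma.html')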

Python / Django compare and update model objects

I've only just started Python but have learned a lot over the last few months; now I have hit a wall with updating objects of a model at a good speed.
I have a model called Product and it is populated from a CSV file; every day this file gets updated with changes like cost and quantity. I can compare each line of the file with the Product model, but with 120k lines this takes 3-4 hours.
What approach can I take to process this file faster? I only want to modify the objects if cost or quantity has changed.
Any suggestions on how to tackle this?
Version 3 of what I have tried:
from django.core.management import BaseCommand
from multiprocessing import Pool
from django.contrib.auth.models import User
from pprint import pprint
from CentralControl.models import Product, Supplier
from CentralControl.management.helpers.map_ingram import *
from CentralControl.management.helpers.helper_generic import *
from tqdm import tqdm
from CentralControl.management.helpers.config import get_ingram
import os, sys, csv, zipfile, CentralControl

# Run Script as 'SYSTEM'
user = User.objects.get(id=1)

# Get Connection config.
SUPPLIER_CODE, FILE_LOCATION, FILE_NAME = get_ingram()

class Command(BaseCommand):
    def handle(self, *args, **options):
        list_in = get_file()
        list_current = get_current_list()
        pool = Pool(6)
        pool.map(compare_lists(list_in, list_current))
        pool.close()

def compare_lists(list_in, list_current):
    for row_current in tqdm(list_current):
        for row_in in list_in:
            if row_in['order_code'] == row_current['order_code']:
                # do more stuff here.
                pass

def get_current_list():
    try:
        supplier = Supplier.objects.get(code='440040')
        current_list = Product.objects.filter(supplier=supplier).values()
        return current_list
    except:
        print('Error no products with supplier')
        exit()

def get_file():
    with zipfile.ZipFile(FILE_LOCATION + 'incoming/' + FILE_NAME, 'r') as zip:
        with zip.open('228688 .csv') as csvfile:
            reader = csv.DictReader(csvfile)
            list_in = (list(reader))
            for row in tqdm(list_in):
                row['order_code'] = row.pop('Ingram Part Number')
                row['order_code'] = (row['order_code']).lstrip("0")
                row['name'] = row.pop('Ingram Part Description')
                row['description'] = row.pop('Material Long Description')
                row['mpn'] = row.pop('Vendor Part Number')
                row['gtin'] = row.pop('EANUPC Code')
                row['nett_cost'] = row.pop('Customer Price')
                row['retail_price'] = row.pop('Retail Price')
                row['qty_at_supplier'] = row.pop('Available Quantity')
                row['backorder_date'] = row.pop('Backlog ETA')
                row['backorder_date'] = (row['backorder_date'])
                row['backorder_qty'] = row.pop('Backlog Information')
        zip.close()
    # commented out for dev process.
    # os.rename(FILE_LOCATION + 'incoming/' + FILE_NAME, FILE_LOCATION + 'processed/' + FILE_NAME)
    return list_in
I once faced a problem of slow data loading, so I can tell you what I did; maybe it can help you somehow. I switched execution to debug mode and tried to find out which column was causing the slow loading, and every time I saw that a column was causing the problem I added an index on it (in the DBMS, PostgreSQL in my case), and it worked. I hope you are facing the same problem, so that my answer can help you.
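For illustration (my own sketch, not part of the original answer), adding such an index from the Django side could look like this, assuming order_code is the field used for the lookups; the other field definitions are placeholders:
# models.py (sketch)
from django.db import models

class Product(models.Model):
    order_code = models.CharField(max_length=64, db_index=True)  # indexed lookup column
    nett_cost = models.DecimalField(max_digits=12, decimal_places=2)
    qty_at_supplier = models.IntegerField()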
Here is a rough idea:
1. When reading the CSV, use pandas as suggested by @BearBrow, e.g. into array_csv.
2. Convert the object data from Django into an array of the same shape, array_obj.
3. Don't compare them one by one; use a vectorised comparison:
compare_index = (array_csv[['cost', 'quantity']] - array_obj[['cost', 'quantity']] == 0)
4. Find the rows that need updating:
obj_need_updated = array_obj[np.logical_or(~compare_index['cost'], ~compare_index['quantity'])]
Then use Django bulk update (https://github.com/aykut/django-bulk-update) to update them in bulk; a sketch of the whole flow follows below.
Hope this gives you hints to speed up your code.
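To make that concrete, here is a minimal sketch (my own illustration, not from the answer) that compares cost and quantity with pandas and writes back only the changed rows using Django's built-in bulk_update (available since Django 2.2; the linked package works similarly). The column and field names (order_code, nett_cost, qty_at_supplier) follow the question's get_file mapping and are assumptions about the model:
import pandas as pd
from CentralControl.models import Product

def update_changed_products(csv_path):
    # Incoming file, keyed by order_code
    incoming = pd.read_csv(csv_path, dtype={'order_code': str})
    incoming = incoming.set_index('order_code')[['nett_cost', 'qty_at_supplier']]

    # Current DB state as a DataFrame, keyed the same way
    current = pd.DataFrame.from_records(
        Product.objects.values('id', 'order_code', 'nett_cost', 'qty_at_supplier')
    ).set_index('order_code')

    # Join the two frames and keep rows where cost or quantity differ
    merged = current.join(incoming, rsuffix='_new', how='inner')
    changed = merged[(merged['nett_cost'] != merged['nett_cost_new']) |
                     (merged['qty_at_supplier'] != merged['qty_at_supplier_new'])]

    # Apply the new values in memory, then write them back in a handful of queries
    new_vals = changed.set_index('id')
    objs = list(Product.objects.filter(id__in=new_vals.index))
    for obj in objs:
        obj.nett_cost = new_vals.at[obj.id, 'nett_cost_new']
        obj.qty_at_supplier = new_vals.at[obj.id, 'qty_at_supplier_new']
    Product.objects.bulk_update(objs, ['nett_cost', 'qty_at_supplier'], batch_size=1000)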
