Problem: I am trying to extract data through an API service. A single request can take anywhere from 3 to 10 seconds, and there are roughly 20,000 rows of data from a Pandas DataFrame to feed into the API calls. I have managed to speed it up a bit through multiprocessing, but it's still running very slowly. Any suggestions?
Code:
import json
import multiprocessing
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import requests

def scored_card_features2(source, n_batches):
    """Multiprocessing version of the scored card features function.
    Returns the reason for rating.
    """
    # read in source data and convert to a list of lists for inputs
    data = pd.read_excel(source)
    data = data[['primary_bank_report_id', 'primary_tu_credit_report_id', 'purpose']]
    inputs = data.values.tolist()

    def scored_card_map(i):
        """Form the request to the scored card service and retrieve values."""
        url = "url/FourthGen?bank_report_id=%s&credit_report_id=%s&" \
              "&loan_purpose=%s" % (i[0], i[1], i[2].replace(" ", "%20"))
        r = requests.get(url)
        try:
            d = json.loads(r.text)
            return [d['probability_of_default'],
                    d['condition'],
                    d['purpose_of_loan'],
                    d['rating'],
                    d['bank_report_id'],
                    d['reason_for_rating'],
                    d['credit_report_id']]
        except (ValueError, KeyError):
            return [np.nan] * 7

    # initiate multiprocessing
    with Pool(n_batches) as p:
        vals = p.map(scored_card_map, inputs)

    result = pd.DataFrame(vals, columns=['Probability of Default', 'Condition', 'Purpose of Loan',
                                         'Rating', 'Bank Report ID', 'Reason for Rating',
                                         'Credit Report ID'])
    result = result.dropna(how='all')
    return result
if __name__ == '__main__':
    # model features
    start = time.time()
    df = scored_card_features2('BankCreditPortalIDsPurpose.xlsx',
                               multiprocessing.cpu_count() - 1)
    df.to_csv('scored_card_features.csv', index=False)
    end = time.time()
    print(end - start)
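Since each request is network-bound rather than CPU-bound, a pool of threads (or an async HTTP client) usually scales much better here than a pool of processes. A minimal sketch, assuming the same hypothetical URL template and response fields as above, using concurrent.futures.ThreadPoolExecutor with a shared requests.Session:

from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests

FIELDS = ['probability_of_default', 'condition', 'purpose_of_loan', 'rating',
          'bank_report_id', 'reason_for_rating', 'credit_report_id']

def fetch_row(session, row):
    """Call the (hypothetical) scored card endpoint for one input row."""
    url = "url/FourthGen?bank_report_id=%s&credit_report_id=%s&loan_purpose=%s" % (
        row[0], row[1], row[2].replace(" ", "%20"))
    try:
        d = session.get(url, timeout=30).json()
        return [d[f] for f in FIELDS]
    except (ValueError, KeyError, requests.RequestException):
        return [np.nan] * len(FIELDS)

def scored_card_features_threaded(inputs, max_workers=32):
    """Run the I/O-bound API calls concurrently on a thread pool."""
    with requests.Session() as session, ThreadPoolExecutor(max_workers=max_workers) as executor:
        vals = list(executor.map(lambda row: fetch_row(session, row), inputs))
    return pd.DataFrame(vals, columns=FIELDS)

With calls taking 3 to 10 seconds each, wall-clock time is governed by how many requests are in flight at once, so raising max_workers (within whatever rate limit the service allows) matters far more than the number of CPU cores.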
Related
I have a Python script that does data analytics on a CSV file. I want to run it periodically in a Docker container: every 15 seconds it should automatically look at folder A, and if there is a CSV file in it, process it and put an HTML report with the same name in folder B.
Here is my Python code.
#This program pulls data from a csv file and renders it as an html file.
#The csv file contains device names, card names and card temperatures.
#The html file reports: how many devices and cards are in the system, which device
#holds the hottest card, and a per-device table showing how many cards each device has,
#how many of them are at 70 degrees or above, and the highest and average card temperatures.
#NOTE: The print functions in the program are written for trial purposes.
from enum import unique
from re import A, T
import pandas as pd
from prettytable import PrettyTable, PLAIN_COLUMNS
table = PrettyTable() #create a table for device
table2 = PrettyTable() #create a table for summary
table.field_names = ["Device -", "Total # of Cards - ", "High Temp. Cards # - ", "Max Temperature - ", "Avg. Temperature "]
table2.field_names = [" "," "]
df = pd.read_csv("cards.csv", sep=';', usecols = ['Device','Card','Temperature'])""", index_col=["Device","Card"]"""
print(type(df))
print(df["Device"].nunique(), "\n\n")  # number of unique devices
total_devices = df["Device"].nunique()  # NUMBER OF DEVICES IN DIFFERENT TYPES
print(total_devices)
print(df["Device"].loc[1], "\n\n")
print(df['Temperature'].max(), "\n\n")
maxTemp = df['Temperature'].max()  # finding the max temperature
print("total card ", )
i = 0
j = 1
# Finding the card with the max temperature and the device where the card is located
while j > 0:
    if df["Temperature"].loc[i] == df["Temperature"].max():
        print(df["Device"].loc[i])
        print(df["Card"].loc[i])
        deviceName = df["Device"].loc[i]
        cardName = df["Card"].loc[i]
        j = 0
    else:
        i = i + 1
dev_types = df["Device"].unique()  # device names
print("\n\n")
newstr = cardName + "/" + deviceName
# Creating the summary table
table2.add_row(["Total Devices ", total_devices])
table2.add_row(["Total Cards ", len(df["Card"])])
table2.add_row(["Max Card Temperature ", df["Temperature"].max()])
table2.add_row(["Hottest Card / Device ", newstr])
print(table2)
row_num = len(df)
print(row_num)
# Re-read the file indexed by device so the cards and temperatures are grouped per device; the per-device max temp comes from here
dn = pd.read_csv("cards.csv", sep=';', index_col=["Device"], usecols=['Device', 'Card', 'Temperature'])
sum = []
high = []
#print("max temp: ", dn["Temperature"].loc[dev_types[1]].max())
for x in range(total_devices):  # total devices (according to the file = 3)
    print("\n")
    cardCount = 0   # counts the number of cards belonging to the device
    count2 = 0      # counts the number of cards with a temperature greater than 70
    tempcount = 0
    print(dev_types[x])
    for y in range(row_num):
        if dev_types[x] == df["Device"].loc[y]:
            print(df["Temperature"].loc[y])
            tempcount = tempcount + df["Temperature"].loc[y]  # the sum of the temperatures of the cards (used when calculating the average)
            cardCount = cardCount + 1
            if df["Temperature"].loc[y] >= 70:
                count2 = count2 + 1
    maxT = dn["Temperature"].loc[dev_types[x]].max()  # finding the max temperature among the cards belonging to this device
    avg = str(tempcount / cardCount)
    print("avg", avg)
    table.add_row([dev_types[x], cardCount, count2, maxT, avg])  # add the information to the "devices" table
    print("num of cards", cardCount)
    print("high temp cards", count2)
    print("\n\n")
print("\n\n")
print(table)
htmlCode = table.get_html_string()
htmlCode2 = table2.get_html_string()
f = open('devices.html', 'w')
f.write("SUMMARY")
f.write(htmlCode2)
f.write("DEVICES")
f.write(htmlCode)
Whether or not the code is run in Docker doesn't matter.
Wrap all of that current logic (well, not the imports and so on) in a function, say, def process_cards().
Call that function forever, in a loop:
import logging
import time

def process_cards():
    table = PrettyTable()
    ...

def main():
    logging.basicConfig()
    while True:
        try:
            process_cards()
        except Exception:
            logging.exception("Failed processing")
        time.sleep(15)

if __name__ == "__main__":
    main()
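The question also asks for the folder A to folder B part, which the loop above doesn't cover. A minimal sketch, assuming hypothetical folder names input/ (folder A) and output/ (folder B), and reusing the get_summary_table/get_devices_table helpers from the simplified version shown below:

from pathlib import Path

import pandas as pd

INPUT_DIR = Path("input")    # "folder A" (hypothetical name)
OUTPUT_DIR = Path("output")  # "folder B" (hypothetical name)

def process_cards():
    """Process every csv in folder A that doesn't yet have a report in folder B."""
    for csv_path in INPUT_DIR.glob("*.csv"):
        html_path = OUTPUT_DIR / (csv_path.stem + ".html")
        if html_path.exists():
            continue  # already processed on an earlier iteration
        df = pd.read_csv(csv_path, sep=';', usecols=['Device', 'Card', 'Temperature'])
        summary_table = get_summary_table(df)
        devices_table = get_devices_table(df)
        with open(html_path, "w") as f:
            f.write("SUMMARY")
            f.write(summary_table.get_html_string(header=False))
            f.write("DEVICES")
            f.write(devices_table.get_html_string())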
As an aside, your data processing code can be vastly simplified:
import pandas as pd
from prettytable import PrettyTable

def get_summary_table(df):
    summary_table = PrettyTable()  # create a table for the summary
    total_devices = df["Device"].nunique()
    hottest_card = df.loc[df["Temperature"].idxmax()]
    hottest_device_desc = f"{hottest_card.Card}/{hottest_card.Device}"
    summary_table.add_row(["Total Devices", total_devices])
    summary_table.add_row(["Total Cards", len(df["Card"])])
    summary_table.add_row(["Max Card Temperature", df["Temperature"].max()])
    summary_table.add_row(["Hottest Card / Device ", hottest_device_desc])
    return summary_table

def get_devices_table(df):
    devices_table = PrettyTable(
        [
            "Device",
            "Total # of Cards",
            "High Temp. Cards #",
            "Max Temperature",
            "Avg. Temperature",
        ]
    )
    for device_name, group in df.groupby("Device"):
        count = len(group)
        avg_temp = group["Temperature"].mean()
        max_temp = group["Temperature"].max()
        high_count = group[group.Temperature >= 70]["Temperature"].count()
        print(f"{device_name=} {avg_temp=} {max_temp=} {high_count=}")
        devices_table.add_row([device_name, count, high_count, max_temp, avg_temp])
    return devices_table

def do_processing(csv_file="cards.csv", html_file="devices.html"):
    # df = pd.read_csv(csv_file, sep=';', usecols=['Device', 'Card', 'Temperature'])
    # (Just some random example data)
    df = pd.DataFrame({
        "Device": [f"Device {1 + x // 3}" for x in range(10)],
        "Card": [f"Card {x + 1}" for x in range(10)],
        "Temperature": [59.3, 77.2, 48.5, 60.1, 77.2, 61.1, 77.4, 65.8, 71.2, 60.3],
    })
    summary_table = get_summary_table(df)
    devices_table = get_devices_table(df)
    with open(html_file, "w") as f:
        f.write(
            "<style>table, th, td {border: 1px solid black; border-collapse: collapse;}</style>"
        )
        f.write("SUMMARY")
        f.write(summary_table.get_html_string(header=False))
        f.write("DEVICES")
        f.write(devices_table.get_html_string())

do_processing()
I have an example of a repeat decorator that runs your function every few seconds or minutes.
I hope this sample helps you.
from typing import Optional, Callable, Awaitable
import asyncio
from functools import wraps

def repeat_every(*, seconds: float, wait_first: bool = False) -> Callable:
    def decorator(function: Callable[[], Optional[Awaitable[None]]]):
        is_coroutine = asyncio.iscoroutinefunction(function)

        @wraps(function)
        async def wrapped():
            async def loop():
                if wait_first:
                    await asyncio.sleep(seconds)
                while True:
                    try:
                        if is_coroutine:
                            await function()
                        else:
                            # run a plain (blocking) function in a worker thread;
                            # asyncio.to_thread requires Python 3.9+
                            await asyncio.to_thread(function)
                    except Exception as e:
                        raise e
                    await asyncio.sleep(seconds)
            asyncio.create_task(loop())

        return wrapped
    print("Repeat every working well.")
    return decorator

@repeat_every(seconds=2)
async def main():
    print(2 * 2)

try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop and loop.is_running():
    print('Async event loop already running.')
    tsk = loop.create_task(main())
    tsk.add_done_callback(
        lambda t: print(f'Task done with result= {t.result()}'))
else:
    print('Starting new event loop')
    asyncio.run(main())
Alternatively, you can create an entrypoint that runs the script as a cron job.
I am using PRAW to get data from Reddit and created this function to do so on multiple subreddits.
It works; however, I am working on a more concise/Pythonic version and can't figure out how to create a single for loop that does the job of the three below.
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# This function aims to scrape data from a list of subreddits.
# From these subreddits, I would like to get the new, hot and rising posts.
def get_data(size_new, size_hot, size_rising, subs_number):
    posts = []
    followers = []
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S')
    # getting x new posts
    for subreddit in subs.new(limit=size_new):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x hot posts
    for subreddit in subs.hot(limit=size_hot):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x rising posts
    for subreddit in subs.rising(limit=size_rising):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting subreddit subscriber numbers
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # concat the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
My request: how could this be more concise/optimized? What methodology do you use to make your code more readable, or better yet, to optimize it for runtime/computational resources?
Thank you
There are various principles for writing better code, and various tools for finding the 'code smells' that may be lurking in it:
DRY - Don't Repeat Yourself
KISS - Keep It Simple, Stupid
SOLID
etc.
Applying some of these principles to the code you posted, even at a surface level, would refactor it into something like:
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# check that the date is greater than the target date
# return true/false
def check_date(subreddit, targeted_date):
    return subreddit.created >= targeted_date

# get specific post data
def get_post_data(subreddit):
    return [subreddit.created, subreddit.subreddit, subreddit.title, subreddit.selftext]

# get posts by sort type
def get_subreddit_post_types(subreddit_sort, targeted_date):
    return [get_post_data(subreddit) for subreddit in subreddit_sort if check_date(subreddit, targeted_date)]

# This function aims to scrape data from a list of subreddits.
# From these subreddits, I would like to get the new, hot and rising posts.
def get_data(size_new, size_hot, size_rising, subs_number):
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S').timestamp()
    posts = []
    followers = []
    # getting x new posts
    posts.extend(get_subreddit_post_types(subs.new(limit=size_new), targeted_date))
    # getting x hot posts
    posts.extend(get_subreddit_post_types(subs.hot(limit=size_hot), targeted_date))
    # getting x rising posts
    posts.extend(get_subreddit_post_types(subs.rising(limit=size_rising), targeted_date))
    # getting subreddit subscriber numbers
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # concat the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
As for optimizing your computational resources: what are you trying to optimize, memory or runtime? The same process applies to either: look at your code and see what can be changed to decrease one versus the other.
Looking at your code, one thing that would generally help is to examine which 'duplicate' posts you are fetching. The new/hot/rising listings pull posts from similar date ranges, and hot/rising may be completely encompassed inside new, so you may be able to drop the drop_duplicates() check on the posts you gathered, and possibly drop the hot/rising calls altogether. A simple alternative to drop_duplicates() is to track seen post IDs while collecting, as sketched below.
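A minimal sketch of that idea, assuming the same subs object as above and using each submission's id attribute to skip posts already collected from an earlier listing:

def get_posts_deduplicated(size_new, size_hot, size_rising, targeted_date):
    """Collect new/hot/rising posts once each, skipping IDs already seen."""
    seen_ids = set()
    posts = []
    listings = [subs.new(limit=size_new), subs.hot(limit=size_hot), subs.rising(limit=size_rising)]
    for listing in listings:
        for submission in listing:
            if submission.id in seen_ids or submission.created < targeted_date:
                continue
            seen_ids.add(submission.id)
            posts.append([submission.created, submission.subreddit,
                          submission.title, submission.selftext])
    return posts

This removes the need for drop_duplicates() afterwards, and if the new listing turns out to cover everything hot/rising return, the second and third listings can simply be removed from the list.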
I'm trying to create a dataset of all my saved tracks on Spotify along with their metadata. I've gotten all of the song features, the track name and the track id. I want to add a column for the track's artist and one for the genre.
I tried adding them via the liked_tracks.extend call but couldn't get it to work.
cid =""
secret = ""
redirect_uri = 'http://localhost:8000/callback'
FEATURE_KEYS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
OFFSET=0
SAVED_TRACKS_LIMIT=50
FEATURE_LIMIT = 100
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=cid,
client_secret=secret,
redirect_uri=redirect_uri,
scope="user-library-read"))
liked_tracks = list()
print(liked_tracks)
while(True):
paged_tracks = sp.current_user_saved_tracks(offset=OFFSET, limit=SAVED_TRACKS_LIMIT)
liked_tracks.extend([{'name':el['track']['name'],
'id':el['track']['id']} for el in paged_tracks['items']])
print(f'Fetched {len(liked_tracks)} tracks')
OFFSET+=SAVED_TRACKS_LIMIT
if paged_tracks['next'] is None:
break
def get_windowed_track_ids(liked_tracks, limit):
for i in range(0, len(liked_tracks), limit):
track_window = liked_tracks[i:i + limit]
yield track_window, [t['id'] for t in track_window]
track_feature_list = list()
print('')
for track_window, track_window_ids in get_windowed_track_ids(liked_tracks, FEATURE_LIMIT):
track_features = sp.audio_features(tracks=track_window_ids)
for index, _track in enumerate(track_window):
_track.update({k:v for k,v in track_features[index].items() if k in FEATURE_KEYS})
track_feature_list.append(_track)
print(f'Fetched features for {len(track_feature_list)} tracks')
df = pd.DataFrame.from_dict(track_feature_list)
mysavedsongs = f'liked_tracks_{int(time())}.csv'
df.to_csv(mysavedsongs, index=False)
print('')
print(f'Saved features to {mysavedsongs}')
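For the artist and genre columns the question actually asks about: a minimal sketch, assuming the standard Spotify Web API response shape where each track carries an artists list and genres live on the artist object (fetched here one artist at a time with sp.artist; sp.artists can batch lookups if you want fewer calls):

# Inside the paging loop, also keep the primary artist's name and id
liked_tracks.extend([{'name': el['track']['name'],
                      'id': el['track']['id'],
                      'artist': el['track']['artists'][0]['name'],
                      'artist_id': el['track']['artists'][0]['id']}
                     for el in paged_tracks['items']])

# After paging, look each artist up once and attach its genres
genre_cache = {}
for track in liked_tracks:
    artist_id = track['artist_id']
    if artist_id not in genre_cache:
        genre_cache[artist_id] = ', '.join(sp.artist(artist_id)['genres'])
    track['genre'] = genre_cache[artist_id]

Because the final DataFrame is built with pd.DataFrame.from_dict(track_feature_list), the new artist and genre keys become extra columns automatically.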
I'm downloading historical candlestick data for multiple crypto pairs across different timeframes from the Binance API. I would like to know how to organize this data by pair and timeframe, and how to check which pair on which timeframe executes my code. The following is what I use to get historical data:
import requests

class BinanceFuturesClient:
    def __init__(self):
        self.base_url = "https://fapi.binance.com"

    def make_requests(self, method, endpoint, data):
        if method == "GET":
            response = requests.get(self.base_url + endpoint, params=data)
            return response.json()

    def get_symbols(self):
        symbols = []
        exchange_info = self.make_requests("GET", "/fapi/v1/exchangeInfo", None)
        if exchange_info is not None:
            for symbol in exchange_info['symbols']:
                if symbol['contractType'] == 'PERPETUAL' and symbol['quoteAsset'] == 'USDT':
                    symbols.append(symbol['pair'])
        return symbols

    def initial_historical_data(self, symbol, interval):
        data = dict()
        data['symbol'] = symbol
        data['interval'] = interval
        data['limit'] = 35
        raw_candle = self.make_requests("GET", "/fapi/v1/klines", data)
        candles = []
        if raw_candle is not None:
            for c in raw_candle:
                candles.append(float(c[4]))
        return candles[:-1]
Running this code
print(binance.initial_historical_data("BTCUSDT", "5m"))
will return this as the output
[55673.63, 55568.0, 55567.89, 55646.19, 55555.0, 55514.53, 55572.46, 55663.91, 55792.83, 55649.43,
55749.98, 55680.0, 55540.25, 55470.44, 55422.01, 55350.0, 55486.56, 55452.45, 55507.03, 55390.23,
55401.39, 55478.63, 55466.48, 55584.2, 55690.03, 55760.81, 55515.57, 55698.35, 55709.78, 55760.42,
55719.71, 55887.0, 55950.0, 55980.47]
which is a list of closes.
I want to loop through the code in such a manner that I can return all the close prices for the pairs and timeframes I need, and sort them accordingly. I did give it a try, but I am stuck at this point:
period = ["1m", "3m", "5m", "15m"]
binance = BinanceFuturesClient()
symbols = binance.get_symbols()
for symbol in symbols:
for tf in period:
historical_candles = binance.initial_historical_data(symbol, tf)
# store values and run through strategy
You can use my code posted below. It requires the python-binance package to be installed in your environment and an API key/secret from your Binance account. The method loads data in weekly chunks (parameter step) and resends requests after a timeout on failure, which helps when you need to fetch a huge amount of data.
import pandas as pd
import pytz, time, datetime
from binance.client import Client
from tqdm.notebook import tqdm

def binance_client(api_key, secret_key):
    return Client(api_key=api_key, api_secret=secret_key)

def load_binance_data(client, symbol, start='1 Jan 2017 00:00:00', timeframe='1M', step='4W', timeout_sec=5):
    tD = pd.Timedelta(timeframe)
    now = (pd.Timestamp(datetime.datetime.now(pytz.UTC).replace(second=0)) - tD).strftime('%d %b %Y %H:%M:%S')
    tlr = pd.DatetimeIndex([start]).append(pd.date_range(start, now, freq=step).append(pd.DatetimeIndex([now])))
    print(f' >> Loading {symbol} {timeframe} for [{start} -> {now}]')
    df = pd.DataFrame()
    s = tlr[0]
    for e in tqdm(tlr[1:]):
        if s + tD < e:
            _start, _stop = (s + tD).strftime('%d %b %Y %H:%M:%S'), e.strftime('%d %b %Y %H:%M:%S')
            chunk = []
            nerr = 0
            while nerr < 3:
                try:
                    chunk = client.get_historical_klines(symbol, timeframe.lower(), _start, _stop)
                    nerr = 100
                except Exception as err:
                    nerr += 1
                    print(str(err))
                    time.sleep(10)
            if chunk:
                data = pd.DataFrame(chunk, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_av', 'trades', 'tb_base_av', 'tb_quote_av', 'ignore'])
                data.index = pd.to_datetime(data['timestamp'].rename('time'), unit='ms')
                data = data.drop(columns=['timestamp', 'close_time']).astype(float).astype({
                    'ignore': bool,
                    'trades': int,
                })
                # note: DataFrame.append is removed in pandas 2.x; use pd.concat there
                df = df.append(data)
        s = e
        time.sleep(timeout_sec)
    return df
How to use
c = binance_client(<your API key>, <your API secret>)
# loading daily data from 1/Mar/21 till now (you can use other timeframes like 1m, 5m etc.)
data = load_binance_data(c, 'BTCUSDT', '2021-03-01', '1D')
It returns an indexed DataFrame with the loaded data:
time                     open      high      low       close     volume    quote_av     trades    tb_base_av   tb_quote_av   ignore
2021-03-02 00:00:00      49595.8   50200     47047.6   48440.7   64221.1   3.12047e+09  1855583   31377        1.52515e+09   False
2021-03-03 00:00:00      48436.6   52640     48100.7   50349.4   81035.9   4.10952e+09  2242131   40955.4      2.07759e+09   False
2021-03-04 00:00:00      50349.4   51773.9   47500     48374.1   82649.7   4.07984e+09  2291936   40270        1.98796e+09   False
2021-03-05 00:00:00      48374.1   49448.9   46300     48751.7   78192.5   3.72713e+09  2054216   38318.3      1.82703e+09   False
2021-03-06 00:00:00      48746.8   49200     47070     48882.2   44399.2   2.14391e+09  1476474   21500.6      1.03837e+09   False
The next steps are up to you and depend on how you would like to design your data structure. In the simplest case you could store the data in dictionaries:
from collections import defaultdict

data = defaultdict(dict)
for symbol in ['BTCUSDT', 'ETHUSDT']:
    for tf in ['1d', '1w']:
        historical_candles = load_binance_data(c, symbol, '2021-05-01', timeframe=tf)
        # store values and run through strategy
        data[symbol][tf] = historical_candles
To access your OHLC data you then just need the following: data['BTCUSDT']['1d'], etc.
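To then check which pair on which timeframe triggers your logic (the original question), a minimal sketch assuming a hypothetical strategy(candles) function that returns True when your entry condition is met:

def strategy(candles):
    """Hypothetical example condition: last close above the 20-period mean close."""
    closes = candles['close']
    return closes.iloc[-1] > closes.tail(20).mean()

for symbol, timeframes in data.items():
    for tf, candles in timeframes.items():
        if strategy(candles):
            print(f"{symbol} on {tf} meets the strategy condition")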
I have two functions, one which creates a dataframe from a csv and another which manipulates that dataframe. There is no problem the first time I pass the raw data through the lsc_age(import_data()) functions. However, I get the error TypeError: 'DataFrame' object is not callable on the second and subsequent attempts. Any ideas for how to solve the problem?
def import_data(csv, date1, date2):
    global data
    data = pd.read_csv(csv, header=1)
    data = data.iloc[:, [0, 1, 4, 6, 7, 8, 9, 11]]
    data = data.dropna(how='all')
    data = data.rename(columns={"National: For Dates 9//1//"+date1+" - 8//31//"+date2: 'event', 'Unnamed: 1': 'time', 'Unnamed: 4': 'points',
                                'Unnamed: 6': 'name', 'Unnamed: 7': 'age', 'Unnamed: 8': 'lsc', 'Unnamed: 9': 'club', 'Unnamed: 11': 'date'})
    data = data.reset_index().drop('index', axis=1)
    data = data[data.time != 'Time']
    data = data[data.points != 'Power ']
    data = data[data['event'] != "National: For Dates 9//1//"+date1+" - 8//31//"+date2]
    data = data[data['event'] != 'USA Swimming, Inc.']
    data = data.reset_index().drop('index', axis=1)
    for i in range(len(data)):
        if len(str(data['event'][i])) <= 3:
            data['event'][i] = data['event'][i-1]
        else:
            data['event'][i] = data['event'][i]
    data = data.dropna()
    age = []
    event = []
    gender = []
    for row in data.event:
        gender.append(row.split(' ')[0])
        if row[:9] == 'Female 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        elif row[:7] == 'Male 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        else:
            n = 2
            groups = row.split(' ')
            event.append(' '.join(groups[n:]))
            groups = row.split(' ')
            age.append(groups[1])
    data['age_group'] = age
    data['event_simp'] = event
    data['gender'] = gender
    data['year'] = date2
    return data

def lsc_age(data_two):
    global lsc, lsc_age, top, all_performers
    lsc = pd.DataFrame(data_two['event'].groupby(data_two['lsc']).count()).reset_index().sort_values(by='event', ascending=False)
    lsc_age = data_two.groupby(['year', 'age_group', 'lsc'])['event'].count().reset_index().sort_values(by=['age_group', 'event'], ascending=False)
    top = pd.concat([lsc_age[lsc_age.age_group == '10 & under'].head(), lsc_age[lsc_age.age_group == '11-12'].head(),
                     lsc_age[lsc_age.age_group == '13-14'].head(), lsc_age[lsc_age.age_group == '15-16'].head(),
                     lsc_age[lsc_age.age_group == '17-18'].head()], ignore_index=True)
    all_performers = pd.concat([lsc_age[lsc_age.age_group == '10 & under'], lsc_age[lsc_age.age_group == '11-12'],
                                lsc_age[lsc_age.age_group == '13-14'], lsc_age[lsc_age.age_group == '15-16'],
                                lsc_age[lsc_age.age_group == '17-18']], ignore_index=True)
    all_performers = all_performers.rename(columns={'event': 'no. top 100'})
    all_performers['age_year_lsc'] = all_performers.age_group + ' ' + all_performers.year.astype(str) + ' ' + all_performers.lsc
    return all_performers

years = [i for i in range(2008, 2018)]
for i in range(len(years)-1):
    lsc_age(import_data(str(years[i+1])+"national100.csv",
                        str(years[i]), str(years[i+1])))
During the first call to your function lsc_age(), the line
lsc_age = data_two.groupby(['year','age_group','lsc'])['event'].count().reset_index().sort_values(by=['age_group','event'],ascending=False)
overwrites your function object with a DataFrame. This happens because you declared the name as global with
global lsc, lsc_age, top, all_performers
so the assignment rebinds the global name lsc_age, which until then referred to the function. Functions in Python are objects like any other, so their names can be reassigned.
To solve the problem, avoid the global declarations; they do not seem to be necessary. Pass your data around through the arguments and return values of your functions, as sketched below.
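A minimal sketch of that suggestion, reusing import_data from above and simply giving the intermediate DataFrame a name that differs from the function's, so lsc_age is never rebound:

def lsc_age(data_two):
    """Return the per-LSC top-100 counts without touching any globals."""
    lsc_age_counts = (data_two.groupby(['year', 'age_group', 'lsc'])['event']
                      .count().reset_index()
                      .sort_values(by=['age_group', 'event'], ascending=False))
    age_groups = ['10 & under', '11-12', '13-14', '15-16', '17-18']
    all_performers = pd.concat(
        [lsc_age_counts[lsc_age_counts.age_group == g] for g in age_groups],
        ignore_index=True)
    all_performers = all_performers.rename(columns={'event': 'no. top 100'})
    all_performers['age_year_lsc'] = (all_performers.age_group + ' '
                                      + all_performers.year.astype(str) + ' '
                                      + all_performers.lsc)
    return all_performers

results = []
years = [i for i in range(2008, 2018)]
for i in range(len(years) - 1):
    df = import_data(str(years[i + 1]) + "national100.csv",
                     str(years[i]), str(years[i + 1]))
    results.append(lsc_age(df))  # lsc_age still refers to the function here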