Python Outer join - python

The below code is used to calculate statistical values.
import re
from pathlib import Path
import pandas as pd
def prepare_values(df):
df_columns = ['frame.time_delta_displayed', 'frame.len']
df_values = []
for col in df_columns:
df_values +=[
df[col].max(),
df[col].min(),
df[col].std(),
df[col].quantile(0.25),
df[col].quantile(0.5),
df[col].quantile(0.75),
df[col].mean(),
df[col].mad(),
df[col].var(),
df[col].skew(),
df[col].kurtosis(),
df[col].sum(),
]
return df_values
source_dir = Path('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/InOutFiltered')
in_data = []
for file in source_dir.glob('**/*.in.csv'):
activity = {'activity': file.stem.split('.')[0]}
df = pd.read_csv(file)
cols =['maxTimeIn', 'minTimeIn', 'stdTimeIn', 'q1TimeIn', 'q2TimeIn', 'q3TimeIn', 'meanTimeIn', 'madTimeIn', 'varianceTimeIn', 'skewTimeIn', 'kurtosisTimeIn', 'sumTimeIn', 'maxLenIn', 'minLenIn', 'stdLenIn', 'q1LenIn','q2lenIn', 'q3LenIn', 'meanLenIn', 'madLenIn', 'varianceLenIn', 'skewLenIn', 'kurtosisLenIn', 'sumLenIn']
values = prepare_values(df)
file_data ={**activity, **dict(zip(cols,values))}
in_data.append(file_data)
out_data =[]
for file in source_dir.glob('**/*.out.csv'):
activity = {'activity': file.stem.split('.')[0]}
df = pd.read_csv(file)
cols =['maxTimeOut', 'minTimeOut', 'stdTimeOut', 'q1TimeOut', 'q2TimeOut', 'q3TimeOut', 'meanTimeOut', 'madTimeOut', 'varianceTimeOut', 'skewTimeOut', 'kurtosisTimeOut', 'sumTimeOut', 'maxLenOut', 'minLenOut', 'stdLenOut', 'q1LenOut', 'q2LenOut', 'q3LenOut', 'meanLenOut', 'madLenOut', 'varianceLenOut', 'skewLenOut', 'kurtosisLenOut','sumLenOut']
values=prepare_values(df)
file_data = {**activity, **dict(zip(cols, values))}
out_data.append(file_data)
in_df = pd.DataFrame(in_data)
out_df = pd.DataFrame(out_data)
all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')
all_df.dropna(subset=all_df.columns.tolist()[1:], how='all', inplace=True)
all_df.fillna(0, inplace=True)
all_df['activity'] = all_df['activity'].apply(lambda x:re.sub(r'^([a-zA-Z]+).*', r'\1',x))
all_df.to_csv('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/AllDataNew.csv', index=False)
I am getting an error. Can't figure out what it means.
Traceback (most recent call last):
File "/root/PycharmProjects/AppAct/StatisticCal.py", line 48, in <module>
all_df= in_df.join(out_df.set_index('activity'), on='activity', how='outer')
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 4178, in set_index
level = frame[col]._values
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'activity'

Related

"TypeError: Cannot join tz-naive with tz-aware DatetimeIndex" using yfinance

I am still new to coding and I have a problem with my code. I try several things but I don't understand why I still got all these error messages when I run my code. If someone could help me since I am on it for a long time now, it will be very nice.
You will find below my code and the error messages. Thank you.
import pandas as pd
import numpy as np
import yfinance as yf
df = yf.download('AAPL',
start='2000-01-01',
end='2010-12-31',
progress=False)
df = df.loc[:, ['Adj Close']]
df.rename(columns={'Adj Close':'adj_close'}, inplace=True)
df['simple_rtn'] = df.adj_close.pct_change()
df['log_rtn'] = np.log(df.adj_close/df.adj_close.shift(1))
QUANDL_KEY = 'LGgsMWx1VdrEv7r2R3Ve'
quandl.ApiConfig.api_key = QUANDL_KEY
df_all_dates = pd.DataFrame(index=pd.date_range(start='1999-12-31',
end='2010-12-31'))
df = df_all_dates.join(df[['adj_close']], how='left') \
.fillna(method='ffill') \
.asfreq('M')
df_cpi = quandl.get(dataset='RATEINF/CPI_USA',
start_date='1999-12-01',
end_date='2010-12-31')
df_cpi.rename(columns={'Value':'cpi'}, inplace=True)
df_merged = df.join(df_cpi, how='left')
df_merged['simple_rtn'] = df_merged.adj_close.pct_change()
df_merged['inflation_rate'] = df_merged.cpi.pct_change()
df_merged['real_rtn'] = (df_merged.simple_rtn + 1) /(df_merged.inflation_rate + 1) - 1
And the errors:
Traceback (most recent call last):
File "C:\Users\.............., line 68, in <module>
df = df_all_dates.join(df[['adj_close']], how='left') \
File "C:\Users\.............., line 9969, in join
return self._join_compat(
File "C:\Users\.............., line 10008, in _join_compat
return merge(
File "C:\Users\.............., line 125, in merge
return op.get_result(copy=copy)
File "C:\Users\.............., line 776, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "C:\Users\.............., line 1015, in _get_join_info
join_index, left_indexer, right_indexer = left_ax.join(
File "C:\Users\.............., line 317, in wrapper
return func(*args, **kwargs)
File "C:\Users\.............., line 230, in join
join_index, lidx, ridx = meth(self, other, how=how, level=level, sort=sort)
File "C:\Users\..............
raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
TypeError: Cannot join tz-naive with tz-aware DatetimeIndex
Process finished with exit code 1

Getting Error While Mapping Data using Dictionary

I'm reading multiple files using this code block. Sometimes the column names in the file and col_map dictionary differ and code through the error. If the column name in the file is Label/Name and the col_map value is Label/, then I will throw the error. I'm looking for a wildcard kind of approach. It can name a partial value match. Then it should map the value.
If the Column name contains Label/, it should map the values.
Errors:
File "backup.py", line 27, in
mapping_function(df)
File "backup.py", line 24, in mapping_function
_data[i] = data[col_map[i]]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.py", line 2927, in getitem
indexer = self.columns.get_loc(key)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Label/Studio/Network/Developer/Publisher'
import pandas as pd
df=pd.read_csv('test.txt',sep=' ')
print(df.columns)
### Columns Name #########
# Label/Name,Item Title,Quantity
col_map = {
"start_date":None,
"end_date":None,
"product_label":"Label/",
"product_title":"Item Title",
"product_sku":None,
"quantity":"Quantity"
}
def mapping_function(data):
_data = {}
for i in col_map:
if col_map[i] is not None:
_data[i] = data[col_map[i]]
mapping_function(df)

builtin keyerror while using pandas datareader to extract data

I'm using a loop to extract data by using pandas datareader, the first two loops are working properly.
But from the third loop, the code starts to return a builtin keyerror which is unexpected. i wonder since the first two loops are working properly, why from the third loop it starts to return error? and how to fix it?
import pandas as pd
import datetime as dt
import pandas_datareader as web
#====================================================
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
#############
prev=15
endDate=dt.datetime.today().date()
sDate=endDate-pd.to_timedelta(prev,unit='d')
#############
def get_price(tickers): #input is a list or Series
result=pd.DataFrame()
for i in tickers:
df=pd.DataFrame()
df['Adj Close']=web.DataReader(i,'yahoo',sDate,endDate)['Adj Close']
df['MA']=df['Adj Close'].rolling(5).mean()
df.sort_values(ascending=False,inplace=True,by="Date")
df['Higher?']=df['Adj Close']>df['MA']
df['Higher?']=df['Higher?'].astype(int)
result['{}'.format(i)]=df['Higher?']
return result
#=============================================================================
base_url = "http://www.sectorspdr.com/sectorspdr/IDCO.Client.Spdrs.Holdings/Export/ExportExcel?symbol="
data = {
'Ticker' : [ 'XLC','XLY','XLP','XLE','XLF','XLV','XLI','XLB','XLRE','XLK','XLU' ]
, 'Name' : [ 'Communication Services','Consumer Discretionary','Consumer Staples','Energy','Financials','Health Care','Industrials','Materials','Real Estate','Technology','Utilities' ]
}
spdr_df = pd.DataFrame(data)
print(spdr_df)
for i, row in spdr_df.iterrows():
url = base_url + row['Ticker']
df_url = pd.read_excel(url)
header = df_url.iloc[0]
holdings_df = df_url[1:]
holdings_df.set_axis(header, axis='columns', inplace=True)
holdings_df=holdings_df['Symbol'].replace('.','-')
a=get_price(holdings_df)
print(a)
the errors are listed below:
a=get_price(holdings_df)
File "C:/Users/austi/Desktop/stock&trading/get etf holdings Main Version.py", line 25, in <module>
df['Adj Close']=web.DataReader(i,'yahoo',sDate,endDate)['Adj Close']
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\util\_decorators.py", line 214, in wrapper
return func(*args, **kwargs)
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas_datareader\data.py", line 387, in DataReader
session=session,
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas_datareader\base.py", line 251, in read
df = self._read_one_data(self.url, params=self._get_params(self.symbols))
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas_datareader\yahoo\daily.py", line 165, in _read_one_data
prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\index.cp36-win32.pyd", line 111, in pandas._libs.index.IndexEngine.get_loc
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\index.cp36-win32.pyd", line 138, in pandas._libs.index.IndexEngine.get_loc
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\hashtable.cp36-win32.pyd", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
Array of values of which unique will be calculated
File "C:\Users\austi\Downloads\Python 3.6.3\Lib\site-packages\pandas\_libs\hashtable.cp36-win32.pyd", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
builtins.KeyError: 'Date'```
For same tickers, there is no date column.
To catch the error and continue, try this code:
def get_price(tickers): #input is a list or Series
result=pd.DataFrame()
for i in tickers:
try:
df=pd.DataFrame()
df['Adj Close']=web.DataReader(i,'yahoo',sDate,endDate)['Adj Close']
df['MA']=df['Adj Close'].rolling(5).mean()
df.sort_values(ascending=False,inplace=True,by="Date") # sometimes error
df['Higher?']=df['Adj Close']>df['MA']
df['Higher?']=df['Higher?'].astype(int)
result['{}'.format(i)]=df['Higher?']
except Exception as ex: # no date column
print('Ticker', i, 'ERROR', ex)
print(df)
return result

Pandas datareader failure

I want to get all the stocks from sp500 to a folder in csv format.
Now while scanning the sp500 everything works great but it seems to be that in some cases the index referred to date is missing because stock doesn't exist or has no date for a specific time, whatever I tried to change startdate and enddate but no effect - in en earlier post I was said to filter those dates with an exception but due to python is new land for me I was like an alien... is there someone who can help me?
If this error occurs:
/home/mu351i/PycharmProjects/untitled/venv/bin/python /home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py
Traceback (most recent call last):
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2897, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 44, in get_data_from_yahoo
df = web.DataReader (ticker, 'yahoo', start, end)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/util/_decorators.py", line 208, in wrapper
return func(*args, **kwargs)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/data.py", line 387, in DataReader
session=session,
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/base.py", line 251, in read
df = self._read_one_data(self.url, params=self._get_params(self.symbols))
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas_datareader/yahoo/daily.py", line 165, in _read_one_data
prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2995, in getitem
indexer = self.columns.get_loc(key)
File "/home/mu351i/PycharmProjects/untitled/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2899, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Date'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 57, in
get_data_from_yahoo()
File "/home/mu351i/PycharmProjects/untitled/get_sp500_beautifulsoup_intro.py", line 48, in get_data_from_yahoo
except RemoteDataError:
NameError: name 'RemoteDataError' is not defined
Process finished with exit code 1
how would you avoid this by changing this code?
import datetime as dt
import os
import pickle
import bs4 as bs
import pandas_datareader.data as web
import requests
def safe_sp500_tickers():
resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = bs.BeautifulSoup(resp.text,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
tickers = []
for row in table.findAll('tr')[1:]:
ticker=row.findAll('td')[0].text.strip()
tickers.append(ticker)
with open('sp500tickers.pickle','wb') as f:
pickle.dump(tickers,f)
return tickers
safe_sp500_tickers()
def get_data_from_yahoo(reload_sp500=False):
if reload_sp500:
tickers=safe_sp500_tickers()
else:
with open('sp500tickers.pickle', 'rb') as f:
tickers = pickle.load(f)
if not os.path.exists('stock_dfs'):
os.makedirs('stock_dfs')
start = dt.datetime(1999,1,1)
end = dt.datetime(2019,12,19)
for ticker in tickers:
try:
if not os.path.exists ('stock_dfs/{}.csv'.format (ticker)):
df = web.DataReader (ticker, 'yahoo', start, end)
df.to_csv ('stock_dfs/{}.csv'.format (ticker))
else:
print ("Ticker from {} already availablle".format (ticker))
except RemoteDataError:
print ("No information for ticker '%s'" % i)
continue
except KeyError:
print("no Date for Ticker: " +ticker )
continue
get_data_from_yahoo()
A Commentator asked for some DATA Sample, well this is DATA form TSLA.csv
Date,High,Low,Open,Close,Volume,Adj Close
2010-06-29,25.0,17.540000915527344,19.0,23.889999389648438,18766300,23.889999389648438
2010-06-30,30.420000076293945,23.299999237060547,25.790000915527344,23.829999923706055,17187100,23.829999923706055
2010-07-01,25.920000076293945,20.270000457763672,25.0,21.959999084472656,8218800,21.959999084472656
2010-07-02,23.100000381469727,18.709999084472656,23.0,19.200000762939453,5139800,19.200000762939453
2010-07-06,20.0,15.829999923706055,20.0,16.110000610351562,6866900,16.110000610351562
2010-07-07,16.6299991607666,14.979999542236328,16.399999618530273,15.800000190734863,6921700,15.800000190734863
2010-07-08,17.520000457763672,15.569999694824219,16.139999389648438,17.459999084472656,7711400,17.459999084472656
2010-07-09,17.899999618530273,16.549999237060547,17.579999923706055,17.399999618530273,4050600,17.399999618530273
2010-07-12,18.06999969482422,17.0,17.950000762939453,17.049999237060547,2202500,17.049999237060547
2010-07-13,18.639999389648438,16.899999618530273,17.389999389648438,18.139999389648438,2680100,18.139999389648438
2010-07-14,20.149999618530273,17.760000228881836,17.940000534057617,19.84000015258789,4195200,19.84000015258789
2010-07-15,21.5,19.0,19.940000534057617,19.889999389648438,3739800,19.889999389648438
2010-07-16,21.299999237060547,20.049999237060547,20.700000762939453,20.639999389648438,2621300,20.639999389648438
Please provide constructive feedback because I'new here.
Thanks :)
You are missing an import
Add the following import at the top of your script
from pandas_datareader._utils import RemoteDataError
import pandas as pd
df = pd.read_html(
"https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
sort = pd.DataFrame(df).sort_values(by=['Date first added'])
sort['Date first added'] = pd.to_datetime(sort['Date first added'])
start_date = '1-1-1999'
end_date = '11-12-2019'
mask = (sort['Date first added'] > start_date) & (
sort['Date first added'] <= end_date)
sort = sort.loc[mask]
pd.DataFrame(sort).to_csv('result.csv', index=False)
Output: View Online
ScreenShot:

Process many csv files in a loop and extract rows from non-empty cells of a specific column using Python

I am made a code to process many csv files. For each one of them, I want to extract all rows corresponding to non-empty cells of a column called "20201-2.0". Have a look in the attached example (this is column LCE):
https://uoe-my.sharepoint.com/personal/gpapanas_ed_ac_uk/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fgpapanas%5Fed%5Fac%5Fuk%2FDocuments%2FCSV%20File%20screenshot%2EPNG&parent=%2Fpersonal%2Fgpapanas%5Fed%5Fac%5Fuk%2FDocuments&originalPath=aHR0cHM6Ly91b2UtbXkuc2hhcmVwb2ludC5jb20vOmk6L2cvcGVyc29uYWwvZ3BhcGFuYXNfZWRfYWNfdWsvRWF5QmJsRlRIbVZKdlJmc0I2aDhWcjRCMDlJZmpRMkwxSTVPUUtVTjJwNXd6dz9ydGltZT10V2Y0c2Q1UzEwZw
I made the following code to perform this:
import pandas as pd
import glob
import os
path = './'
#column = ['20201-2.0']
all_files = glob.glob(path + "/*.csv")
for filename in all_files:
# Option 1 below worked, although without isolating the non-nulled values
# 1. df = pd.read_csv(filename, encoding="ISO-8859-1")
df = pd.read_csv(filename, header = 0)
df = df[df['20201-2.0'].notnull()]
print('extracting info from cvs...')
print(df)
# You can now export all outcomes in new csv files
file_name = filename + 'new' + '.csv'
save_path = os.path.abspath(
os.path.join(
path, file_name
)
)
print('saving ...')
export_csv = df.to_csv(save_path, index=None)
del df
del export_csv
However, although I manage to generate the first file, I get the following error:
Traceback (most recent call last):
File "/home/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '20201-2.0'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/PycharmProjects/OPTIMAT/Read_MR_from_all_csv.py", line 21, in <module>
df = df[df['20201-2.0'].notnull()]
File "/home/giorgos/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '20201-2.0'
I can't understand why this is happening. Any ideas would be greatly appreciated.
Happy to say that I found a way to do this:
import pandas as pd
import glob
import os
import numpy as np
path = './'
#column = ['20201-2.0']
# all_files = glob.glob(path + "/*.csv")
#li = []
all_files = os.listdir(path)
all_df = pd.DataFrame()
for filename in all_files:
if not filename.endswith('csv'):
continue
print('extracting info from ' + filename)
# Option 1 below worked, although without isolating the non-nulled values
# 1. df = pd.read_csv(filename, encoding="ISO-8859-1")
df = pd.read_csv(filename, header=0)
#df = df[df['20201-2.0'].notnull()]
df_subset = df.dropna(subset=['20201-2.0'])
print('processed ' + filename)
# You can now export all outcomes in new csv files
file_name = filename.split('.')[0] + '_new' + '.csv'
print('saving to' + file_name)
export_csv = df_subset.to_csv('./' + file_name, index=None)
del df
del export_csv

Categories

Resources