I am having trouble running a script that gets counts of predictions from CSV files in a given directory. The format of the CSV looks like this:
Sample data
and the code is the following:
import os
from glob import glob
import pandas as pd
def get_count(distribution, keyname):
    try:
        count = distribution[keyname]
    except KeyError:
        count = 0
    return count
main_path = "K:\\...\\folder_name"
folder_paths = glob("%s\\*" % main_path)
data = []
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    distribution = results.Predictions.value_counts()
    print(distribution)
    num_of_x = get_count(distribution, "x")
    num_of_y = get_count(distribution, "y")
    num_of_z = get_count(distribution, "z")
    d = {"filename": file_name, "x": num_of_x, "y": num_of_y, "z": num_of_z}
    data.append(d)

df = pd.DataFrame(data=data)
df.to_csv(os.path.join(main_path, "summary_counts.csv"), index=False)
The output error is KeyError: 'Filename', referring to the pd.Series line. Would anyone know how to solve this?
I am using Python 3.7.3 and pandas 1.0.5 and I am a beginner in programming...
Many thanks in advance
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File ".\save_counts.py", line 24, in <module>
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
  File "K:\...\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
    indexer = self.columns.get_loc(key)
  File "K:\...\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Filename'
In here:

for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
you are creating a pd.Series, but those values exist only inside this for loop. If you want to use the results DataFrame in distribution after the loop, you need to use append(): create an empty DataFrame and append each results to it:

final_results = pd.DataFrame()
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    final_results = final_results.append(results)
# and from this point you can continue
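Side note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on newer versions the same accumulation is usually written by collecting the frames in a list and concatenating once at the end. A minimal sketch, assuming the same folder_paths as in the question (error_bad_lines was removed as well; newer pandas spells it on_bad_lines):

frames = []
for path in folder_paths:
    results = pd.read_csv(path)  # add on_bad_lines='skip' to mimic error_bad_lines=False
    results['Label'] = results['Filename'].str.split("\\").str[0]
    frames.append(results)
final_results = pd.concat(frames, ignore_index=True)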
Seeking your assistance with this issue; I have tried many variations of the syntax but still get the same error. I have multiple CSV files to convert and I am pulling the same data from each; the script works on one of my CSV files but not on the other. Looking forward to your feedback. Thank you very much.
My code:
import os
import pandas as pd
directory = 'C:/path'
ext = ('.csv')

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if f.endswith(ext):
        head_tail = os.path.split(f)
        head_tail1 = 'C:/path'
        k = head_tail[1]
        r = k.split(".")[0]
        p = head_tail1 + "/" + r + " - Revised.csv"
        mydata = pd.read_csv(f)

        # to pull columns and values
        new = mydata[["A", "Room", "C", "D"]]
        new = new.rename(columns={'D': 'Qty. of Parts'})
        new['Qty. of Parts'] = 1
        new.to_csv(p, index=False)

        # to merge columns and values
        merge_columns = ['A', 'Room', 'C']
        merged_col = ''.join(merge_columns).replace('ARoomC', 'F')
        new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(x), axis=1)
        new.drop(merge_columns, axis=1, inplace=True)
        new = new.groupby(merged_col).count().reset_index()
        new.to_csv(p, index=False)
The error I get:
Traceback (most recent call last):
File "C:Path\MyProject.py", line 34, in <module>
new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(x), axis=1)
File "C:Path\MyProject.py", line 9565, in apply
return op.apply().__finalize__(self, method="apply")
File "C:Path\MyProject.py", line 746, in apply
return self.apply_standard()
File "C:Path\MyProject.py", line 873, in apply_standard
results, res_index = self.apply_series_generator()
File "C:Path\MyProject.py", line 889, in apply_series_generator
results[i] = self.f(v)
File "C:Path\MyProject.py", line 34, in <lambda>
new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(x), axis=1)
TypeError: sequence item 1: expected str instance, int found
It's hard to say what you're trying to achieve without a sample of your data. But in any case, to fix the error you need to cast each value to a string with str inside the lambda passed to pandas.DataFrame.apply:
new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(map(str, x)), axis=1)
Or, you can cast the whole selection first with pandas.DataFrame.astype:
new[merged_col] = new[merge_columns].astype(str).apply(lambda x: '.'.join(x), axis=1)
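A quick reproduction on made-up data (the column names and values here are placeholders, not taken from the question) shows why the cast is needed:

import pandas as pd

new = pd.DataFrame({'A': ['a1', 'a2'], 'Room': [101, 102], 'C': ['c1', 'c2']})
merge_columns = ['A', 'Room', 'C']

# '.'.join(x) would raise TypeError here, because Room holds ints
merged = new[merge_columns].astype(str).apply(lambda x: '.'.join(x), axis=1)
print(merged)
# 0    a1.101.c1
# 1    a2.102.c2
# dtype: object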
I am getting a memory error while creating a DataFrame. I am reading a gzip-compressed file from S3 and loading the byte data into a DataFrame, but I get a MemoryError. Could you please help me avoid this, or suggest what changes I can make to my code?
Code:
import datetime
import gzip
from collections import OrderedDict
from io import BytesIO

import boto3
import pandas as pd

list_table = []
for table in d:
    dict_table = OrderedDict()
    s_time = datetime.datetime.now().strftime("%H:%M:%S")
    print("start_time--->>", s_time)
    print("tablename--->>", table)
    s3 = boto3.resource('s3')
    key = 'raw/vs-1/load-1619/data' + '/' + table
    obj = s3.Object('*******', key)
    n = obj.get()['Body'].read()
    gzipfile = BytesIO(n)
    gzipfile = gzip.GzipFile(fileobj=gzipfile)
    content = gzipfile.read()
    #print(content)
    content_str = content.decode('utf-8')
    df1 = pd.DataFrame([x.split(',') for x in str(content_str).split('\n')])
    #print(df1)
    #count = os.popen('aws s3 cp s3://itx-agu-lake/raw/vs-1/load-1619/data/{0} - | wc -l'.format(table)).read()
    count = int(len(df1)) - 2
    del(df1)
    e_time = datetime.datetime.now().strftime("%H:%M:%S")
    print("End_time---->>", e_time)
    print(count)
    dict_table['Table_Name'] = str(table)
    dict_table['Count'] = count
    list_table.append(dict_table)
I am getting the memory error in the line below:
df1 = pd.DataFrame([x.split(',') for x in str(content_str).split('\n')])
Error:
Traceback (most recent call last):
File "ravi_sir.py", line 45, in <module>
df1 = pd.DataFrame([x.split(',') for x in str(content_str).split('\n')])
File "/app/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 520, in __init__
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/construction.py", line 93, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1650, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1739, in form_blocks
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1784, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1830, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError
Does it help to utilize the Pandas series string split method?
# a sample string
content_str = 'a,b,c,d\nd,e,f,g\nh,i,j,k'
content_str = str(content_str).split('\n')
df1 = pd.DataFrame(content_str)
df1 = df1[0].str.split(',', expand=True)
Posted here instead of the comments because it isn't pretty to post code there.
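One further observation, offered as a sketch rather than a tested fix: since df1 is only used for its length and then deleted, you could skip building the DataFrame entirely and count newlines in the decoded text, which avoids the per-row lists that trigger the MemoryError:

count = content_str.count('\n') - 1  # same value as len(content_str.split('\n')) - 2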
Hello, I have working code like this:

import pandas as pd
from pandas.io.json import json_normalize
import json
import warnings
warnings.filterwarnings('ignore')
with open('yieldfull.json') as file:
    data = json.load(file)

df_json = json_normalize(data)
df_json_stripped = data[0]
platform_dict = df_json_stripped['result']
platform_names = []

for key in platform_dict:
    platform_names.append(key)

for name in platform_names:
    if name == 'Autofarm':
        vault_name_df = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.LPVaults.vaults'].items()]))[0])['name']
        current_token_0 = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.LPVaults.vaults'].items()]))[0])['LPInfo.currentToken0']
        current_token_1 = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.LPVaults.vaults'].items()]))[0])['LPInfo.currentToken1']
        df_json = pd.DataFrame({'Vault_Name': vault_name_df, 'Current_Token_0': current_token_0, 'Current_Token_1': current_token_1})
        df_json.to_excel('Output_' + name + '.xlsx', index=False)
        platform_names.remove(name)
    elif name == 'Acryptos':
        vault_name_df = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.vaults.vaults'].items()]))[0])['name']
        price_USD = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.vaults.vaults'].items()]))[0])['priceInUSDDepositToken']
        current_token_0 = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.vaults.vaults'].items()]))[0])['currentTokens']
        deposited_token = json_normalize(pd.DataFrame(dict([(k, pd.Series(v)) for k, v in df_json['result.' + name + '.vaults.vaults'].items()]))[0])['depositedTokens']
        df_json = pd.DataFrame({'Vault_Name': vault_name_df, 'Price_USD': price_USD, 'Current_Token_0': current_token_0, 'Deposited_Token': deposited_token})
        df_json.to_excel('Output_' + name + '.xlsx', index=False)
    else:
        pass
Problem is: if I leave it like this, it only outputs a file for the first if. When I comment out that if section, it will successfully output the elif, but I can't get it to output two files whatever I do. Any ideas?
Error I'm getting for Acryptos:
Traceback (most recent call last):
File "C:\Users\Adam\PycharmProjects\Scrapy_Things\venv\lib\site-packages\pandas\core\indexes\base.py", line 3080, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'result.Acryptos.vaults.vaults'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/Adam/PycharmProjects/Scrapy_Things/yieldwatch/yieldwatch/spiders/JsonExcel.py", line 27, in <module>
vault_name_df = json_normalize(pd.DataFrame(dict([(k , pd.Series(v)) for k,v in df_json['result.'+name+'.vaults.vaults'].items()]))[0])['name']
File "C:\Users\Adam\PycharmProjects\Scrapy_Things\venv\lib\site-packages\pandas\core\frame.py", line 3024, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\Adam\PycharmProjects\Scrapy_Things\venv\lib\site-packages\pandas\core\indexes\base.py", line 3082, in get_loc
raise KeyError(key) from err
KeyError: 'result.Acryptos.vaults.vaults'
But if I comment out Autofarm and just process the if for Acryptos, it outputs the Excel file just fine.
Please remove the line below from your code:

platform_names.remove(name)

Debug code:
platform_names = ['Autofarm', 'Acryptos']
for name in platform_names:
    if name == 'Autofarm':
        print("Autofarm")
        #platform_names.remove(name)  # remove this line
    elif name == "Acryptos":
        print("Acryptos")
You have initially created

df_json = json_normalize(data)

and inside the loop you are overwriting it:

df_json = pd.DataFrame({'Vault_Name': vault_name_df, 'Current_Token_0': current_token_0, 'Current_Token_1': current_token_1})
df_json.to_excel('Output_' + name + '.xlsx', index=False)

After the Autofarm pass, df_json no longer contains the 'result.Acryptos.vaults.vaults' column, which is why the Acryptos branch raises the KeyError. So change the name used inside the loop and it will be okay.
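If it helps, a rough sketch of that rename (output_df is just an example name, not from the original code; the elided parts stay as they are):

df_json = json_normalize(data)  # keep the source frame under this name

for name in platform_names:
    if name == 'Autofarm':
        # ... build vault_name_df and the token columns from df_json as before ...
        output_df = pd.DataFrame({'Vault_Name': vault_name_df,
                                  'Current_Token_0': current_token_0,
                                  'Current_Token_1': current_token_1})
        output_df.to_excel('Output_' + name + '.xlsx', index=False)
    elif name == 'Acryptos':
        # df_json still holds every 'result.*' column here, so the
        # df_json['result.' + name + '.vaults.vaults'] lookup succeeds
        ...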
I am trying to create a variable that stores the path of a folder called TimeSeries inside the directory I am working in. After that, I am trying to read each file in TimeSeries. Apparently, my error stems from df = pd.read_csv(f) receiving a relative path instead of an absolute path. However, I can't confirm this, since when I check os.path.isabs(direct), I get back True. I do know that the error is about that specific line; I just don't know what the problem is.
Code:
import pandas as pd
import numpy as np
import os
direct = os.path.abspath('TimeSeries')

for f in direct:
    df = pd.read_csv(f)
    df = df.replace(np.nan, 'Other', regex=True)
    if df.columns[0] == ['FIPS']:
        print(df.columns)
        df = df.drop(['FIPS', 'Last_Update', 'Lat', 'Long_'], axis=1)
        df = df.rename(columns={'Admin2': 'County',
                                'Province_State': 'State',
                                'Country_Region': 'Country',
                                'Combined_Key': 'City'})
        df.to_csv(f)
    elif df.columns[0] == ['Province/State']:
        print(df.columns)
        df = df.drop(['Last Update'], axis=1)
        df = df.rename(columns={'Province/State': 'State',
                                'Country/Region': 'Country'})
        df.to_csv(f)
    else:
        pass
Result:
Traceback (most recent call last):
File "C:/Users/USER/PycharmProjects/Corona Stats/Corona.py", line 9, in <module>
df = pd.read_csv(f)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 448, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 880, in __init__
self._make_engine(self.engine)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 1114, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 1891, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 374, in pandas._libs.parsers.TextReader.__cinit__
File "pandas\_libs\parsers.pyx", line 674, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: [Errno 2] File C does not exist: 'C'
Process finished with exit code 1
This is what happens when I print direct.
C:\Users\USER\PycharmProjects\Corona Stats\TimeSeries
IIUC: Try:
source = "C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries"
for filename in os.listdir(source):
    filepath = os.path.join(source, filename)
    if not os.path.isfile(filepath):
        continue
    df = pd.read_csv(filepath)
    df = df.replace(np.nan, 'Other', regex=True)
    if df.columns[0] == 'FIPS':
        print(df.columns)
        df = df.drop(['FIPS', 'Last_Update', 'Lat', 'Long_'], axis=1)
        df = df.rename(columns={'Admin2': 'County',
                                'Province_State': 'State',
                                'Country_Region': 'Country',
                                'Combined_Key': 'City'})
        df.to_csv(filepath)
    elif df.columns[0] == 'Province/State':
        print(df.columns)
        df = df.drop(['Last Update'], axis=1)
        df = df.rename(columns={'Province/State': 'State',
                                'Country/Region': 'Country'})
        df.to_csv(filepath)
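For what it's worth, this version also compares df.columns[0] against the plain strings 'FIPS' and 'Province/State'. The original compared against one-element lists, and in Python a string never equals a list, so neither branch could ever run:

print('FIPS' == ['FIPS'])  # False: a string never equals a list
print('FIPS' == 'FIPS')    # True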
With pandas, when you call pd.read_csv with a relative path, the path is resolved against the current working directory, which by default is where the Python process was started. So one option is to use the os module to chdir() into the data directory and take it from there.
import pandas as pd
import os
print(os.getcwd())
os.chdir("<PATH TO DIRECTORY>")
print(os.getcwd())
df = pd.read_csv('<The Filename You want to read>')
print(df.head())
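A side note on this approach: os.chdir() changes the working directory for the whole process, so every later relative path is affected too. Building absolute paths with os.path.join, as in the answer above, avoids that global state.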
Here you're iterating over EACH letter in the path:
direct = 'C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries'
for f in direct:
    ...
If you want to get the files in the directory you should use something like:
for item in os.listdir(direct):
    ...
Personally I would use pathlib:
from pathlib import Path
direct = Path('C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries')
for item in direct.glob('*'):
    ...
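For completeness, a small runnable sketch of the pathlib version (the path is the one from the question, and pd.read_csv accepts Path objects directly):

from pathlib import Path

import pandas as pd

direct = Path('C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries')
for item in direct.glob('*.csv'):  # match only the .csv files
    df = pd.read_csv(item)
    print(item.name, df.shape)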
I have made some code to process many CSV files. For each of them, I want to extract all rows corresponding to non-empty cells of a column called "20201-2.0". Have a look at the attached example (this is column LCE):
https://uoe-my.sharepoint.com/personal/gpapanas_ed_ac_uk/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fgpapanas%5Fed%5Fac%5Fuk%2FDocuments%2FCSV%20File%20screenshot%2EPNG&parent=%2Fpersonal%2Fgpapanas%5Fed%5Fac%5Fuk%2FDocuments&originalPath=aHR0cHM6Ly91b2UtbXkuc2hhcmVwb2ludC5jb20vOmk6L2cvcGVyc29uYWwvZ3BhcGFuYXNfZWRfYWNfdWsvRWF5QmJsRlRIbVZKdlJmc0I2aDhWcjRCMDlJZmpRMkwxSTVPUUtVTjJwNXd6dz9ydGltZT10V2Y0c2Q1UzEwZw
I made the following code to perform this:
import pandas as pd
import glob
import os
path = './'
#column = ['20201-2.0']

all_files = glob.glob(path + "/*.csv")

for filename in all_files:
    # Option 1 below worked, although without isolating the non-nulled values
    # 1. df = pd.read_csv(filename, encoding="ISO-8859-1")
    df = pd.read_csv(filename, header=0)
    df = df[df['20201-2.0'].notnull()]
    print('extracting info from cvs...')
    print(df)

    # You can now export all outcomes in new csv files
    file_name = filename + 'new' + '.csv'
    save_path = os.path.abspath(
        os.path.join(
            path, file_name
        )
    )
    print('saving ...')
    export_csv = df.to_csv(save_path, index=None)
    del df
    del export_csv
However, although I manage to generate the first file, I get the following error:
Traceback (most recent call last):
File "/home/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '20201-2.0'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/PycharmProjects/OPTIMAT/Read_MR_from_all_csv.py", line 21, in <module>
df = df[df['20201-2.0'].notnull()]
File "/home/giorgos/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '20201-2.0'
I can't understand why this is happening. Any ideas would be greatly appreciated.
Happy to say that I found a way to do this:
import pandas as pd
import glob
import os
import numpy as np

path = './'
#column = ['20201-2.0']
# all_files = glob.glob(path + "/*.csv")
#li = []

all_files = os.listdir(path)
all_df = pd.DataFrame()

for filename in all_files:
    if not filename.endswith('csv'):
        continue

    print('extracting info from ' + filename)
    # Option 1 below worked, although without isolating the non-nulled values
    # 1. df = pd.read_csv(filename, encoding="ISO-8859-1")
    df = pd.read_csv(filename, header=0)
    #df = df[df['20201-2.0'].notnull()]
    df_subset = df.dropna(subset=['20201-2.0'])
    print('processed ' + filename)

    # You can now export all outcomes in new csv files
    file_name = filename.split('.')[0] + '_new' + '.csv'
    print('saving to ' + file_name)
    export_csv = df_subset.to_csv('./' + file_name, index=None)
    del df
    del export_csv
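One possible explanation for the original KeyError, judging from the traceback, is that one of the CSV files simply has no '20201-2.0' column. The dropna version above would raise on such a file as well, so if that can happen, a small guard (this check is my addition, not part of the original code) keeps the loop going:

# inside the for loop, before the dropna call
if '20201-2.0' not in df.columns:
    print('skipping ' + filename + ': no 20201-2.0 column')
    continue
df_subset = df.dropna(subset=['20201-2.0'])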