I am trying to create a variable that stores the path of a folder called TimeSeries inside my working directory, and then read each file in TimeSeries. Apparently the error stems from df = pd.read_csv(f) receiving a relative path instead of an absolute one, but I can't confirm this: when I check isabs(direct), I get back True. I do know the error comes from that specific line; I just don't know what it is.
Code:
import pandas as pd
import numpy as np
import os
direct = os.path.abspath('TimeSeries')
for f in direct:
    df = pd.read_csv(f)
    df = df.replace(np.nan, 'Other', regex=True)
    if df.columns[0] == ['FIPS']:
        print(df.columns)
        df = df.drop(['FIPS', 'Last_Update', 'Lat', 'Long_'], axis=1)
        df = df.rename(columns={'Admin2': 'County',
                                'Province_State': 'State',
                                'Country_Region': 'Country',
                                'Combined_Key': 'City'})
        df.to_csv(f)
    elif df.columns[0] == ['Province/State']:
        print(df.columns)
        df = df.drop(['Last Update'], axis=1)
        df = df.rename(columns={'Province/State': 'State',
                                'Country/Region': 'Country'})
        df.to_csv(f)
    else:
        pass
Result:
Traceback (most recent call last):
File "C:/Users/USER/PycharmProjects/Corona Stats/Corona.py", line 9, in <module>
df = pd.read_csv(f)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 448, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 880, in __init__
self._make_engine(self.engine)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 1114, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Users\USER\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 1891, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 374, in pandas._libs.parsers.TextReader.__cinit__
File "pandas\_libs\parsers.pyx", line 674, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: [Errno 2] File C does not exist: 'C'
Process finished with exit code 1
This is what happens when I print direct.
C:\Users\USER\PycharmProjects\Corona Stats\TimeSeries
If I understand correctly, try:
source = "C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries"
for filename in os.listdir(source):
    filepath = os.path.join(source, filename)
    if not os.path.isfile(filepath):
        continue
    df = pd.read_csv(filepath)
    df = df.replace(np.nan, 'Other', regex=True)
    if df.columns[0] == 'FIPS':
        print(df.columns)
        df = df.drop(['FIPS', 'Last_Update', 'Lat', 'Long_'], axis=1)
        df = df.rename(columns={'Admin2': 'County',
                                'Province_State': 'State',
                                'Country_Region': 'Country',
                                'Combined_Key': 'City'})
        df.to_csv(filepath)
    elif df.columns[0] == 'Province/State':
        print(df.columns)
        df = df.drop(['Last Update'], axis=1)
        df = df.rename(columns={'Province/State': 'State',
                                'Country/Region': 'Country'})
        df.to_csv(filepath)
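One caveat worth adding as a follow-up (my addition, not part of the original answer): to_csv writes the DataFrame index by default, so rewriting files in place like this adds an extra unnamed column on every run. Passing index=False avoids that:
df.to_csv(filepath, index=False)  # skip the index column when rewriting in place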
When you call read_csv (or pd.read_csv) with a relative path, pandas resolves it against the current working directory, which by default is wherever the Python process was started. So you can use the os module to chdir() into the right directory and take it from there.
import pandas as pd
import os
print(os.getcwd())
os.chdir("<PATH TO DIRECTORY>")
print(os.getcwd())
df = pd.read_csv('<The Filename You want to read>')
print(df.head())
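Alternatively, you can avoid changing the working directory (which is global state) by building absolute paths explicitly. A minimal sketch, assuming the same TimeSeries folder as in the question:
import os
import pandas as pd

direct = os.path.abspath('TimeSeries')

# os.listdir yields bare file names, so join each one with the folder path.
for filename in os.listdir(direct):
    filepath = os.path.join(direct, filename)
    if os.path.isfile(filepath) and filename.endswith('.csv'):
        df = pd.read_csv(filepath)
        print(filepath, df.shape)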
Here you're iterating over EACH letter in the path:
direct = 'C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries'
for f in direct:
    ...
If you want to get the files in the directory you should use something like:
for item in os.listdir(direct):
    ...
Personally I would use pathlib:
from pathlib import Path
direct = Path('C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries')
for item in direct.glob('*'):
    ...
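For completeness, a minimal sketch of the pathlib route, assuming the folder holds only the CSVs you want to process:
from pathlib import Path
import pandas as pd

direct = Path('C:/Users/USER/PycharmProjects/Corona Stats/TimeSeries')

# glob('*.csv') yields Path objects with the folder already prefixed,
# so each one can be passed straight to read_csv.
for item in direct.glob('*.csv'):
    df = pd.read_csv(item)
    print(item.name, df.shape)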
Related
Seeking your assistance with this issue: I have tried many variations of the syntax but still get the same error. I have multiple CSV files to convert, pulling the same data from each; the script works on one of my CSV files but not on the other. Looking forward to your feedback. Thank you very much.
My code:
import os
import pandas as pd
directory = 'C:/path'
ext = ('.csv')
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if f.endswith(ext):
        head_tail = os.path.split(f)
        head_tail1 = 'C:/path'
        k = head_tail[1]
        r = k.split(".")[0]
        p = head_tail1 + "/" + r + " - Revised.csv"
        mydata = pd.read_csv(f)
        # to pull columns and values
        new = mydata[["A", "Room", "C", "D"]]
        new = new.rename(columns={'D': 'Qty. of Parts'})
        new['Qty. of Parts'] = 1
        new.to_csv(p, index=False)
        # to merge columns and values
        merge_columns = ['A', 'Room', 'C']
        merged_col = ''.join(merge_columns).replace('ARoomC', 'F')
        new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(x), axis=1)
        new.drop(merge_columns, axis=1, inplace=True)
        new = new.groupby(merged_col).count().reset_index()
        new.to_csv(p, index=False)
The error I get:
Traceback (most recent call last):
File "C:Path\MyProject.py", line 34, in <module>
new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(x), axis=1)
File "C:Path\MyProject.py", line 9565, in apply
return op.apply().__finalize__(self, method="apply")
File "C:Path\MyProject.py", line 746, in apply
return self.apply_standard()
File "C:Path\MyProject.py", line 873, in apply_standard
results, res_index = self.apply_series_generator()
File "C:Path\MyProject.py", line 889, in apply_series_generator
results[i] = self.f(v)
File "C:Path\MyProject.py", line 34, in <lambda>
new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(x), axis=1)
TypeError: sequence item 1: expected str instance, int found
It's hard to say what you're trying to achieve without a sample of your data. But in any case, to fix the error you need to cast each value to a string with str inside the apply:
new[merged_col] = new[merge_columns].apply(lambda x: '.'.join(map(str, x)), axis=1)
Or you can convert the whole selection up front with pandas.DataFrame.astype:
new[merged_col] = new[merge_columns].astype(str).apply(lambda x: '.'.join(x), axis=1)
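As a quick illustration on a toy frame (hypothetical data, just to show the dtype issue):
import pandas as pd

new = pd.DataFrame({'A': ['x', 'y'], 'Room': [101, 102], 'C': ['a', 'b']})
merge_columns = ['A', 'Room', 'C']

# Fails with "expected str instance, int found" because Room holds ints:
# new[merge_columns].apply(lambda x: '.'.join(x), axis=1)

# Works once every value is a string:
merged = new[merge_columns].astype(str).apply(lambda x: '.'.join(x), axis=1)
print(merged.tolist())  # ['x.101.a', 'y.102.b']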
I am having trouble running a script that gets counts of predictions from the CSV files in a given directory. The format of the CSV looks like this:
Sample data
and the code is the following:
import os
from glob import glob
import pandas as pd
def get_count(distribution, keyname):
    try:
        count = distribution[keyname]
    except KeyError:
        count = 0
    return count

main_path = "K:\\...\\folder_name"
folder_paths = glob("%s\\*" % main_path)
data = []

for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    distribution = results.Predictions.value_counts()
    print(distribution)
    num_of_x = get_count(distribution, "x")
    num_of_y = get_count(distribution, "y")
    num_of_z = get_count(distribution, "z")
    d = {"filename": file_name, "x": num_of_x, "y": num_of_y, "z": num_of_z}
    data.append(d)

df = pd.DataFrame(data=data)
df.to_csv(os.path.join(main_path, "summary_counts.csv"), index=False)
The output error is KeyError: "Filename", referring to the pd.Series line. Would anyone know how to solve this?
I am using Python 3.7.3 and pandas 1.0.5 and I am a beginner in programming...
Many thanks in advance
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File ".\save_counts.py", line 24, in <module>
results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
File "K:\...\lib\site-packages\pandas\core\frame.py
", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "K:\...\site-packages\pandas\core\indexes\
base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get
_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get
_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.has
htable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.has
htable.PyObjectHashTable.get_item
KeyError: 'Filename'
in here:
for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
you are creating a pd.Series, but those values exist only inside this for loop. If, after the loop, you want to use the results DataFrame for distribution, you need to accumulate it with append(): create an empty DataFrame and append each results to it:
final_results = pd.DataFrame()

for path in folder_paths:
    file_name = os.path.splitext(os.path.basename(path))[0]
    results = pd.read_csv(path, error_bad_lines=False)
    results['Label'] = pd.Series(results['Filename'].str.split("\\").str[0])
    final_results = final_results.append(results)
    # and from this point you can continue
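A side note beyond the original answer: DataFrame.append was deprecated and removed in pandas 2.0, so on recent versions the same accumulation is done with pd.concat. A minimal sketch, reusing folder_paths from above:
import pandas as pd

frames = []
for path in folder_paths:
    results = pd.read_csv(path)
    results['Label'] = results['Filename'].str.split("\\").str[0]
    frames.append(results)

# A single concat at the end is also much faster than appending inside the loop.
final_results = pd.concat(frames, ignore_index=True)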
I have two simple questions here:
First question: how do I read a txt (or any) file without declaring the file name? I will use a certain path, for example C:\Users\mahmoud\PycharmProjects\text_files; this folder contains all the text files I want to convert. So I want something like a for loop that iterates over the path and converts all files of this type to CSV or Excel.
Second question (related to the first): how can I reduce the code below?
import pandas as pd

dataf_gsm_car_txt = 'gsmcarrier_mnm.txt'
dataf_gsm_rel_txt = 'gsmrelation_mnm.txt'

class push_all_to_csv(object):
    def push_gsmcarrier(self):
        dataf_gsm_car_txt_df = pd.read_csv(dataf_gsm_car_txt, sep=';')
        dataf_gsm_car_df_column_index = list(dataf_gsm_car_txt_df.columns)
        dataf_gsm_car_txt_df.reset_index(inplace=True)
        dataf_gsm_car_txt_df.drop(columns=dataf_gsm_car_txt_df.columns[-1], inplace=True)
        dataf_gsm_car_df_column_index = dict(zip(list(dataf_gsm_car_txt_df.columns), dataf_gsm_car_df_column_index))
        dataf_gsm_car_txt_df.rename(columns=dataf_gsm_car_df_column_index, inplace=True)
        dataf_gsm_car_txt_df.to_excel('gsmcarrier_mnm.xlsx', 'Sheet1', index=False)

    def push_gsmrelation(self):
        dataf_gsm_txt_df = pd.read_csv(dataf_gsm_rel_txt, sep=';')
        dataf_gsm_rel_df_column_index = list(dataf_gsm_txt_df.columns)
        dataf_gsm_txt_df.reset_index(inplace=True)
        dataf_gsm_txt_df.drop(columns=dataf_gsm_txt_df.columns[-1], inplace=True)
        dataf_gsm_rel_df_column_index = dict(zip(list(dataf_gsm_txt_df.columns), dataf_gsm_rel_df_column_index))
        dataf_gsm_txt_df.rename(columns=dataf_gsm_rel_df_column_index, inplace=True)
        dataf_gsm_txt_df.to_excel('gsmrelation_mnm.xlsx', 'Sheet1', index=False)

    def push_umtscarrier(self):
        dataf_umts_car_txt = 'umtscarrier_mnm.txt'
        dataf_umts_car_txt_df = pd.read_csv(dataf_umts_car_txt, sep=';')
        dataf_umts_car_df_column_index = list(dataf_umts_car_txt_df.columns)
        dataf_umts_car_txt_df.reset_index(inplace=True)
        dataf_umts_car_txt_df.drop(columns=dataf_umts_car_txt_df.columns[-1], inplace=True)
        dataf_umts_car_df_column_index = dict(zip(list(dataf_umts_car_txt_df.columns), dataf_umts_car_df_column_index))
        dataf_umts_car_txt_df.rename(columns=dataf_umts_car_df_column_index, inplace=True)
        dataf_umts_car_txt_df.to_excel('umtscarrier_mnm.xlsx', 'Sheet1', index=False)

    def push_umtsrelation(self):  # was a second "push_gsmrelation", which silently overrode the method above
        dataf_umts_rel_txt = 'umtsrelation_mnm.txt'
        dataf_umts_txt_df = pd.read_csv(dataf_umts_rel_txt, sep=';')
        dataf_umts_rel_df_column_index = list(dataf_umts_txt_df.columns)
        dataf_umts_txt_df.reset_index(inplace=True)
        dataf_umts_txt_df.drop(columns=dataf_umts_txt_df.columns[-1], inplace=True)
        dataf_umts_rel_df_column_index = dict(zip(list(dataf_umts_txt_df.columns), dataf_umts_rel_df_column_index))
        dataf_umts_txt_df.rename(columns=dataf_umts_rel_df_column_index, inplace=True)
        dataf_umts_txt_df.to_csv('umtsrelation_mnm.csv')

def get_push_all_to_csv():
    return push_all_to_csv()
I just need to structure the code something like the layout below, using pandas functions like to_csv, to_excel, where, isin... a lot of functions, plus some SQL queries as support. The code structure as I imagine it should look like this:
database_connection/
│
├── connection.py
to_sql/
│
├── file_tosql_1.py
├── file_tosql_2.py
to_csv/
├── file_tocsv_1.py
├── file_tocsv_2.py
assets/
Main_App.py
I found a way to reduce the method calls with the code below:
from inspect import ismethod

# Method that calls all the methods defined on an object
def call_all(obj, *args, **kwargs):
    for name in dir(obj):
        attribute = getattr(obj, name)
        if ismethod(attribute):
            attribute(*args, **kwargs)
Any suggestions?
Edited
def clean(path):
    df = pd.read_csv(path, sep=';')
    cols = df.columns.tolist()
    df = df.reset_index().drop(columns=df.columns[-1])
    new_cols = dict(zip(list(df.columns), cols))
    df = df.rename(columns=new_cols)
    new_file_path = path.split(".")[:-1].append(".xlsx")
    df.to_csv(path, index=False)

def main():
    path = r"C:\Users\haroo501\PycharmProjects\ToolUpdated\data_feed"
    pathPat = os.path.join(path, "*.txt")
    all_file_name = glob.glob(pathPat)
    for file_path in all_file_name:
        print(file_path)
        clean(file_path)

if __name__ == "__main__":
    main()
Edited 2:
path = r"C:\\Users\\haroo501\\PycharmProjects\\ToolUpdated\\data_feed"
pathPat = os.path.join(path,"*.txt")
all_file_name = glob.glob(pathPat)
for file_path in all_file_name:
with open(file_path) as currentFile:
pd.read_csv(currentFile, delimiter = "\t")
print(file_path)
Error:
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "C:\Program Files\JetBrains\PyCharm Community Edition 2019.3.2\plugins\python-ce\helpers\pydev\_pydev_bundle\pydev_umd.py", line 197, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "C:\Program Files\JetBrains\PyCharm Community Edition 2019.3.2\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/haroo501/PycharmProjects/ToolUpdated/txt_to_csv/convert_to_csv.py", line 64, in <module>
pd.read_csv(currentFile, delimiter = "\t")
File "C:\Users\haroo501\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\haroo501\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 448, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "C:\Users\haroo501\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 880, in __init__
self._make_engine(self.engine)
File "C:\Users\haroo501\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 1114, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Users\haroo501\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py", line 1891, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 532, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
Question 1
import os
import glob

path = r"D:\txtFiles"
pathPat = os.path.join(path, "*.txt")
all_file_name = glob.glob(pathPat)

for fileName in all_file_name:
    with open(fileName) as currentFile:
        # do something with the file
        ...
You can use the glob library to access the files, as trigonom mentioned above, and a single function to iterate over all the files and save them back.
UPDATE: I was using path.split(".")[:-1] to build an .xlsx filename from the original name, but list.append() returns None, so new_file_path ended up as None and caused the errors. You can replace the file extension with str.replace instead.
import os
import glob
import pandas as pd

def clean(path):
    df = pd.read_csv(path, sep=';')
    cols = df.columns.tolist()
    df = df.reset_index().drop(columns=df.columns[-1])
    new_cols = dict(zip(list(df.columns), cols))
    df = df.rename(columns=new_cols)
    # new change here: build the .xlsx name and write to it
    new_file_path = path.replace(".txt", ".xlsx")
    print(new_file_path)
    df.to_excel(new_file_path, sheet_name="Sheet1", index=False)

def main():
    path = r"D:\txtFiles"
    pathPat = os.path.join(path, "*.txt")
    all_file_name = glob.glob(pathPat)
    for file_path in all_file_name:
        print(file_path)
        clean(file_path)

if __name__ == "__main__":
    main()
I made a script to process many CSV files. For each one of them, I want to extract all rows corresponding to non-empty cells of a column called "20201-2.0". Have a look at the attached example (this is column LCE):
https://uoe-my.sharepoint.com/personal/gpapanas_ed_ac_uk/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fgpapanas%5Fed%5Fac%5Fuk%2FDocuments%2FCSV%20File%20screenshot%2EPNG&parent=%2Fpersonal%2Fgpapanas%5Fed%5Fac%5Fuk%2FDocuments&originalPath=aHR0cHM6Ly91b2UtbXkuc2hhcmVwb2ludC5jb20vOmk6L2cvcGVyc29uYWwvZ3BhcGFuYXNfZWRfYWNfdWsvRWF5QmJsRlRIbVZKdlJmc0I2aDhWcjRCMDlJZmpRMkwxSTVPUUtVTjJwNXd6dz9ydGltZT10V2Y0c2Q1UzEwZw
I made the following code to perform this:
import pandas as pd
import glob
import os

path = './'
#column = ['20201-2.0']
all_files = glob.glob(path + "/*.csv")

for filename in all_files:
    # Option 1 below worked, although without isolating the non-null values
    # 1. df = pd.read_csv(filename, encoding="ISO-8859-1")
    df = pd.read_csv(filename, header=0)
    df = df[df['20201-2.0'].notnull()]
    print('extracting info from csv...')
    print(df)

    # You can now export all outcomes in new csv files
    file_name = filename + 'new' + '.csv'
    save_path = os.path.abspath(os.path.join(path, file_name))
    print('saving ...')
    export_csv = df.to_csv(save_path, index=None)
    del df
    del export_csv
However, although I manage to generate the first file, I then get the following error:
Traceback (most recent call last):
File "/home/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '20201-2.0'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/PycharmProjects/OPTIMAT/Read_MR_from_all_csv.py", line 21, in <module>
df = df[df['20201-2.0'].notnull()]
File "/home/giorgos/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: '20201-2.0'
I can't understand why this is happening. Any ideas would be greatly appreciated.
Happy to say that I found a way to do this:
import pandas as pd
import glob
import os
import numpy as np

path = './'
#column = ['20201-2.0']
# all_files = glob.glob(path + "/*.csv")
#li = []
all_files = os.listdir(path)
all_df = pd.DataFrame()

for filename in all_files:
    if not filename.endswith('csv'):
        continue
    print('extracting info from ' + filename)
    # Option 1 below worked, although without isolating the non-null values
    # 1. df = pd.read_csv(filename, encoding="ISO-8859-1")
    df = pd.read_csv(filename, header=0)
    #df = df[df['20201-2.0'].notnull()]
    df_subset = df.dropna(subset=['20201-2.0'])
    print('processed ' + filename)

    # You can now export all outcomes in new csv files
    file_name = filename.split('.')[0] + '_new' + '.csv'
    print('saving to ' + file_name)
    export_csv = df_subset.to_csv('./' + file_name, index=None)
    del df
    del export_csv
The below code is used to calculate statistical values.
import re
from pathlib import Path
import pandas as pd
def prepare_values(df):
    df_columns = ['frame.time_delta_displayed', 'frame.len']
    df_values = []
    for col in df_columns:
        df_values += [
            df[col].max(),
            df[col].min(),
            df[col].std(),
            df[col].quantile(0.25),
            df[col].quantile(0.5),
            df[col].quantile(0.75),
            df[col].mean(),
            df[col].mad(),
            df[col].var(),
            df[col].skew(),
            df[col].kurtosis(),
            df[col].sum(),
        ]
    return df_values
source_dir = Path('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/InOutFiltered')

in_data = []
for file in source_dir.glob('**/*.in.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols = ['maxTimeIn', 'minTimeIn', 'stdTimeIn', 'q1TimeIn', 'q2TimeIn', 'q3TimeIn', 'meanTimeIn', 'madTimeIn', 'varianceTimeIn', 'skewTimeIn', 'kurtosisTimeIn', 'sumTimeIn', 'maxLenIn', 'minLenIn', 'stdLenIn', 'q1LenIn', 'q2lenIn', 'q3LenIn', 'meanLenIn', 'madLenIn', 'varianceLenIn', 'skewLenIn', 'kurtosisLenIn', 'sumLenIn']
    values = prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    in_data.append(file_data)

out_data = []
for file in source_dir.glob('**/*.out.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols = ['maxTimeOut', 'minTimeOut', 'stdTimeOut', 'q1TimeOut', 'q2TimeOut', 'q3TimeOut', 'meanTimeOut', 'madTimeOut', 'varianceTimeOut', 'skewTimeOut', 'kurtosisTimeOut', 'sumTimeOut', 'maxLenOut', 'minLenOut', 'stdLenOut', 'q1LenOut', 'q2LenOut', 'q3LenOut', 'meanLenOut', 'madLenOut', 'varianceLenOut', 'skewLenOut', 'kurtosisLenOut', 'sumLenOut']
    values = prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    out_data.append(file_data)

in_df = pd.DataFrame(in_data)
out_df = pd.DataFrame(out_data)
all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')
all_df.dropna(subset=all_df.columns.tolist()[1:], how='all', inplace=True)
all_df.fillna(0, inplace=True)
all_df['activity'] = all_df['activity'].apply(lambda x: re.sub(r'^([a-zA-Z]+).*', r'\1', x))
all_df.to_csv('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/AllDataNew.csv', index=False)
I am getting an error and can't figure out what it means.
Traceback (most recent call last):
File "/root/PycharmProjects/AppAct/StatisticCal.py", line 48, in <module>
all_df= in_df.join(out_df.set_index('activity'), on='activity', how='outer')
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 4178, in set_index
level = frame[col]._values
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'activity'
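A likely cause, offered as a guess since no sample data is shown: if the glob for '**/*.out.csv' matches no files, out_data stays an empty list, pd.DataFrame(out_data) then has no columns at all, and out_df.set_index('activity') raises exactly this KeyError. A minimal guard sketch, reusing the names from the script above:
import pandas as pd

out_df = pd.DataFrame(out_data)

# An empty frame built from an empty list has no 'activity' column,
# so check before joining on it.
if out_df.empty:
    print('No *.out.csv files found under', source_dir)
else:
    all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')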