Ignore some json files during parsing - python

I have the following code that reads some JSON files from a directory and returns them after some preprocessing. However, some of them are dicts, so they do not have the desired columns, and I get this error:
KeyError: "None of [Index(['aaa', 'xxx'], dtype='object')] are in the [columns]"
How can I skip those files and continue with the other JSON files? Perhaps with a try-except block?
import os, json
import pandas as pd

path_to_json = 'C:/Users/aaa/Desktop/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

def func(s):
    try:
        return eval(s)
    except:
        return dict()

list_of_df = []
for i in range(len(json_files)):
    file_name = json_files[i]
    df = pd.read_json(os.path.join(path_to_json, file_name), lines=True)
    df = df[['columnx']]
    df = df['columnx'].apply(func)
    df = pd.json_normalize(df)
    df = pd.DataFrame(df[["xxx", "aaa"]])
    list_of_df.append(df)

df = pd.concat(list_of_df)
df = df[['Index', 'xxx', 'aaa']]
df.head()

You have to add a try-except block inside the for loop that iterates over the JSON files.
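A minimal sketch of that loop, with the steps that can raise the KeyError wrapped in try-except (column names and paths taken from the question):

list_of_df = []
for file_name in json_files:
    try:
        df = pd.read_json(os.path.join(path_to_json, file_name), lines=True)
        df = df['columnx'].apply(func)
        df = pd.json_normalize(df)
        df = pd.DataFrame(df[["xxx", "aaa"]])
        list_of_df.append(df)
    except KeyError:
        # this file is missing 'columnx', 'xxx' or 'aaa' -- skip it
        print(f'Skipping {file_name}: expected columns not found')

df = pd.concat(list_of_df)

Catching KeyError specifically, rather than using a bare except, keeps other genuine errors (unreadable files, bad JSON) visible.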

Related

The csv writer is writing some un-realistic values to the csv in python

In my code, the csv writer is writing some unrealistic values to the CSV file.
My goal is to read all the csv files in one directory, apply a filter on a specific column, and write the filtered dataframe to a consolidated csv file.
I am able to get the outputs as required in the VS console, but I am not able to write them into a csv file.
Kindly help me understand what I am doing incorrectly.
(The sample input and resulting output were shown as screenshots, omitted here.)
Code:
import pandas as pd
import os
import glob
import csv
from pandas.errors import EmptyDataError

# use glob to get all the csv files in the folder
path = os.getcwd()
#print(path)
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(csv_files)
col_name = input("Enter the column name to filter: ")
print(col_name)
State_Input = input("Enter the {} ".format(col_name))
print(State_Input)
df_empty = pd.DataFrame()
for i in csv_files:
    try:
        df = pd.read_csv(i)
        #print(df.head(5))
        State_Filter = df["State"] == State_Input
        print(df[State_Filter])
        df_child = (df[State_Filter])
        with open('D:\\PythonProjects\\File-Split-Script\\temp\\output\\csv_fil111.csv', 'w') as csvfile:
            data_writer = csv.writer(csvfile, dialect='excel')
            for row in df_child:
                data_writer.writerows(row)
    except EmptyDataError as e:
        print('There was an error in your input, please try again :{0}'.format(e))
Use pd.to_csv to write your file at once. (Iterating over a DataFrame with for row in df_child yields column labels rather than rows, so writerows ends up writing each label character by character; the output file is also reopened in 'w' mode on every loop iteration, overwriting earlier results.) Prefer storing your filtered dataframes in a list, then concatenate all of them into a new dataframe:
import pandas as pd
import pathlib

data_dir = pathlib.Path.cwd()

# Your input here
state = input('Enter the state: ')  # Gujarat, Bihar, ...
print(state)

data = []
for csvfile in data_dir.glob('*.csv'):
    df = pd.read_csv(csvfile)
    df = df.loc[df['State'] == state]
    data.append(df)

df = pd.concat(data, ignore_index=True)
df.to_csv('output.csv', index=False)

How can we read JSON data from URL, convert to dataframe, and save as CSV

I'm playing around with some code to read JSON encoded data from a URL, push it into a data frame and save the results to a CSV. The code that I attempted to run is shown below. I think this is pretty close, but something is wrong, because nothing gets downloaded.
import urllib
from urllib.request import urlopen
import json
import pandas as pd
from pandas.io.json import json_normalize

all_links = ['https://www.baptisthealthsystem.com/docs/global/standard-charges/474131755_abrazomaranahospital_standardcharges.json?sfvrsn=9a27928_2',
             'https://www.baptisthealthsystem.com/docs/global/standard-charges/621861138_abrazocavecreekhospital_standardcharges.json?sfvrsn=674fd6f_2',
             'https://www.baptisthealthsystem.com/docs/global/standard-charges/621809851_abrazomesahospital_standardcharges.json?sfvrsn=13953222_2',
             'https://www.baptisthealthsystem.com/docs/global/standard-charges/621811285_abrazosurprisehospital_standardcharges.json?sfvrsn=c8113dcf_2']

for item in all_links:
    #print(item)
    try:
        length = len(item)
        first_under = item.find('_') + 1
        last_under = item.rfind('?') - 21
        file_name = item[first_under:last_under]
        print(file_name)
        # store the response of URL
        response = urlopen(item)
        data = json.loads(response.read())
        #print(type(data))
        data = json.loads(item.read().decode())
        df = pd.DataFrame(json_normalize(data, 'metrics'), encoding='mac_roman')
        DOWNLOAD_PATH = 'C:\\Users\\ryans\\Desktop\\hospital_data\\' + file_name + '.csv'
        urllib.request.urlretrieve(df, DOWNLOAD_PATH)
    except Exception as e:
        print(e)
Any thoughts on what could be wrong here?
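Two things stand out: item is a URL string, so item.read() raises an AttributeError (the response was already read into data two lines earlier), and urlretrieve expects a URL, not a DataFrame, so nothing is ever written; the except clause just prints the exception and moves on. A minimal sketch of the loop with those lines removed, writing each dataframe with df.to_csv (assuming the 'metrics' record path and the download folder from the question):

for item in all_links:
    try:
        first_under = item.find('_') + 1
        last_under = item.rfind('?') - 21
        file_name = item[first_under:last_under]
        # read and parse the response once
        response = urlopen(item)
        data = json.loads(response.read())
        df = pd.DataFrame(json_normalize(data, 'metrics'))
        # write the dataframe directly; urlretrieve is only for downloading URLs
        df.to_csv('C:\\Users\\ryans\\Desktop\\hospital_data\\' + file_name + '.csv', index=False)
    except Exception as e:
        print(e)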

I have multiple dataframes in a list. How to print their names?

I am reading in multiple files and adding them to a list:
import pandas as pd
import glob
import ntpath

path = r'C:\Folder1\Folder2\Folder3\Folder3'
all_files = glob.glob(path + "/*.dat") #.dat files only
mylist = []
for filename in all_files:
    name = ntpath.basename(filename) # for renaming the DF
    name = name.replace('.dat','') # remove extension
    try:
        name = pd.read_csv(filename, sep='\t', engine='python')
        mylist.append(name)
    except:
        print(f'File not read: {filename}')
Now I want to just display the DFs in this list.
This is what I've tried:
for thing in mylist:
    print(thing.name)
AttributeError: 'DataFrame' object has no attribute 'name'
And
for item in mylist:
    print(item)
But that just prints the whole DF content.
name = pd.read_csv(filename, sep='\t', engine='python')
mylist.append(name)
Here, name is the dataframe itself, not the name of your dataframe: the read_csv call overwrites the basename string you computed just above it.
To keep a name with your dataframe, use
df = pd.read_csv(filename, sep='\t', engine='python')
df_name = "Sample name"
mylist.append({'data': df, 'name': df_name})
>>> print(thing['name'])
Sample name
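With that structure, the display loop from the question becomes (a sketch using the same names):

for thing in mylist:
    print(thing['name'])  # the stored name
    df = thing['data']    # the dataframe itself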
You can use a dictionary for that.
Writing to dict:
import pandas as pd
import glob
import ntpath

path = r'C:\Folder1\Folder2\Folder3\Folder3'
all_files = glob.glob(path + "/*.dat") #.dat files only
mydict = {}
for filename in all_files:
    name = ntpath.basename(filename) # for renaming the DF
    name = name.replace('.dat','') # remove extension
    try:
        mydict[name] = pd.read_csv(filename, sep='\t', engine='python')
    except:
        print(f'File not read: {filename}')
To read a df (say filename1) again:
df = mydict['filename1']
or to iterate over all df's in mydict:
for df in mydict.values():
    # use df...
or:
for key in mydict:
    print(key)
    df = mydict[key]
    # use df...

AttributeError: 'list' object has no attribute 'to_hdf'

I am running the following code, which imports csv files and appends all the data into a single DATA list. But while storing this list in HDF5, I keep getting the error AttributeError: 'list' object has no attribute 'to_hdf'.
Please help me understand what I am missing.
import pandas as pd
import h5py
import glob
import os

path = "Z:\Test Data"

def get_CSV_files(path):
    results = []
    for root, dirs, files in os.walk(path):
        for file in files:
            fileExt = os.path.splitext(file)[-1]
            if fileExt.lower() == '.csv':
                results.append(os.path.join(root, file))
        for directory in dirs:
            results += get_CSV_files(os.path.join(root, directory))
    return results

def store_all_data_hdf5(path):
    files = get_CSV_files(path)
    DATA = []
    for file_name in files:
        data = pd.DataFrame.from_csv(file_name, sep="\t")
        DATA.append(data)
    store = pd.HDFStore('STORE.h5')
    DATA.to_hdf('STORE.h5', 'table', append=True)
    store.close()
    return DATA
DATA is a list (you define it with DATA = []), and a list does not have the attribute to_hdf.
You can find examples of how to use a pandas HDFStore in the pandas documentation.
You would probably need something like:
def store_all_data_hdf5(path):
    files = get_CSV_files(path)
    DATA = []
    store = pd.HDFStore('STORE.h5')
    for file_name in files:
        data = pd.read_csv(file_name, sep="\t")  # DataFrame.from_csv is deprecated; read_csv replaces it
        DATA.append(data)
        store.put('my_file', data, format='table', append=True)  # appending requires the table format
    store.close()
    return DATA
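Alternatively, since the error comes from calling to_hdf on the list itself, you could concatenate the dataframes first and write once (a sketch, assuming the csv files share the same columns):

def store_all_data_hdf5(path):
    files = get_CSV_files(path)
    DATA = [pd.read_csv(f, sep="\t") for f in files]
    combined = pd.concat(DATA, ignore_index=True)  # a single DataFrame does have to_hdf
    combined.to_hdf('STORE.h5', key='table', format='table')
    return DATA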

Try / Except in Python to show which file threw the error

I am writing a Python script to import a large number of files, manipulate them and then output to .csv. Which is cake in Pandas, however I have no control over the files coming in, so I am trying to write the script to handle an exception for files that come in the "wrong" way.
Anyway, I am using a try/except to show the user that there is a KeyError in one of the files (basically there is a " in a cell when the datatype is int).
My question is: is there a way to have the except block report the file name of the file that caused the error?
for csv in csvList:
    df = pd.read_csv(csv, header=0, skip_blank_lines=True, skipinitialspace=True)\
        .dropna(how='all')
    try:
        df[0] = df[0].astype(int)
        df[1] = df[1].astype(int)
        df[2] = df[2].astype(int)
        df[3] = df[3].astype(int)
        report_path = 'UPC_Ready_for_Import'
        if not os.path.exists(report_path):
            os.makedirs(report_path)
        df.to_csv(os.path.join(report_path, csv + '_import.csv'), index=False)
    except KeyError:
        print('Error within file, please review files')
Assuming csvList contains the list of input file paths:
for csv in csvList:
    ....
    try:
        ...
    except KeyError:
        print('Error within file {}, please review files'.format(csv))
You could write something like this, I guess:
for csv in csvList:
    df = pd.read_csv(csv, header=0, skip_blank_lines=True, skipinitialspace=True)\
        .dropna(how='all')
    report_path = 'UPC_Ready_for_Import'
    file_name = os.path.join(report_path, csv + '_import.csv')
    try:
        df[0] = df[0].astype(int)
        df[1] = df[1].astype(int)
        df[2] = df[2].astype(int)
        df[3] = df[3].astype(int)
        if not os.path.exists(report_path):
            os.makedirs(report_path)
        df.to_csv(file_name, index=False)
    except KeyError:
        print('Error within file', file_name, ', please review files')
The main idea is to store the file name in a variable file_name (assigned before the try, so it is always defined when the except block runs) and use it in the except block.
