Read CSV file with Python function

I'm trying to write read/write functions for a csv, but they don't seem to return any value.
I'm reading from a CSV, replacing the ";" in the second column with " ", and saving the processed csv.
But for some reason it doesn't save my csv. Is my function wrong?
I'm just starting out in the Python world, and I'm having a bit of trouble.
import pandas as pd

header_col = ['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';', names=header_col, header=None)

def file_load(df):
    df['col1'] = df['col1'].str.replace(';', ' ')
    df.drop(columns=['col8'], inplace=True)
    df.drop(columns=['col9'], inplace=True)
    return df

def save_file(dataframe):
    df = dataframe
    df.to_csv('myfile_<date>_treat.csv', sep=';', encoding='utf-8', index=False)

You define file_load and save_file but never call them, so nothing is ever written. Call them, for example from a main():

import pandas as pd

def file_load(df):
    df['col1'] = df['col1'].str.replace(';', ' ')
    df.drop(columns=['col8'], inplace=True)
    df.drop(columns=['col9'], inplace=True)
    return df

def save_file(dataframe):
    dataframe.to_csv('myfile_<date>_treat.csv', sep=',', encoding='utf-8', index=False)

def main():
    header_col = ['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
    df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';',
                     names=header_col, header=None)
    df1 = file_load(df)
    save_file(df1)

if __name__ == '__main__':
    main()
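One pitfall worth calling out: df['col1'].str.replace(';', ' ') replaces inside each element, while str(df['col1']).replace(';', ' ') first converts the whole Series to its printed representation and replaces in that single string, which is never what you want here. A quick standalone illustration (not from the original post):

import pandas as pd

s = pd.Series(['a;b', 'c;d'])
print(s.str.replace(';', ' ').tolist())  # ['a b', 'c d'] - element-wise replacement
print(type(str(s).replace(';', ' ')))    # <class 'str'> - one big string, not a Series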

Related

The csv writer is writing some unrealistic values to the csv in python

In my code, the csv writer is writing some unrealistic values to the CSV file.
My goal is to read all the csv files in one directory, filter on a specific column, and write the filtered dataframes to one consolidated csv file.
I can see the expected output in the console, but I am not able to write it to a csv file.
Please help me understand what I am doing incorrectly.
Code:
import pandas as pd
import os
import glob
import csv
from pandas.errors import EmptyDataError

# use glob to get all the csv files in the folder
path = os.getcwd()
#print(path)
csv_files = glob.glob(os.path.join(path, "*.csv"))
print(csv_files)

col_name = input("Enter the column name to filter: ")
print(col_name)
State_Input = input("Enter the {} ".format(col_name))
print(State_Input)

df_empty = pd.DataFrame()
for i in csv_files:
    try:
        df = pd.read_csv(i)
        #print(df.head(5))
        State_Filter = df["State"] == State_Input
        print(df[State_Filter])
        df_child = (df[State_Filter])
        with open('D:\\PythonProjects\\File-Split-Script\\temp\\output\\csv_fil111.csv', 'w') as csvfile:
            data_writer = csv.writer(csvfile, dialect='excel')
            for row in df_child:
                data_writer.writerows(row)
    except EmptyDataError as e:
        print('There was an error in your input, please try again :{0}'.format(e))
Use DataFrame.to_csv to write your file in one go. Prefer storing your filtered dataframes in a list, then concatenating them all into a new dataframe:
import pandas as pd
import pathlib

data_dir = pathlib.Path.cwd()

# Your input here
state = input('Enter the state: ')  # Gujarat, Bihar, ...
print(state)

data = []
for csvfile in data_dir.glob('*.csv'):
    df = pd.read_csv(csvfile)
    df = df.loc[df['State'] == state]
    data.append(df)

# pd.concat stacks the filtered frames row-wise (axis=0, the default)
df = pd.concat(data, ignore_index=True)
df.to_csv('output.csv', index=False)

I am trying to filter 800 csv files

I am trying to filter 800 csv files, but I get no output on the first run. The script only filters the data after running it 3 or 4 times; the first run produces nothing, without any error.
import pandas as pd
import os
from datetime import datetime

def filter_data(filenames, search_keyword):
    df_resultant = pd.DataFrame([])
    for filename in filenames:
        col_names = ["col1", "col2", "col3", "col4", "col5"]
        # excel_file_df = pd.read_csv(filename, index_col=False, nrows=0)
        # excel_file_df = pd.read_csv(filename, low_memory=False, names=col_names, dtype={'col1': str,'col4': str})
        # excel_file_df = pd.read_csv(filename, low_memory=False, names=col_names)
        excel_file_df = pd.read_csv(filename, dtype='unicode', names=col_names)
        df = excel_file_df[excel_file_df['col3'].str.contains(
            search_keyword, na=False, case=False)]
        if not df.empty:
            df_copy = df.copy()
            df_copy.loc[:, 'file_name'] = os.path.basename(filename)
            df_copy.columns = [''] * len(df_copy.columns)
            df_resultant = pd.concat([df_resultant, df_copy])
    df_resultant.to_csv('./output/' + str(datetime.now().timestamp()) + '.csv', index=False)

How to use multiprocessing to update a pandas DataFrame in Python?

The goal of the following code is to sort big_df by timestamp within each userid. Finally, I get the selected_df.
import pandas as pd
from collections import Counter

def init_process():
    big_df = pd.DataFrame()
    big_df = ...  # It contains four columns and many rows.
    big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
    users_list = Counter(big_df['userid'])
    selected_df = pd.DataFrame()  # it is an empty dataframe
    for uid in users_list:
        small_df = big_df[big_df['userid'] == uid]
        new_user_items_df = small_df.sort_values(by=['timestamp'])
        selected_df = selected_df.append(new_user_items_df, ignore_index=True)

if __name__ == '__main__':
    init_process()
But when big_df is very large, sometimes 80G, it takes many days to process. Therefore, I want to use multiprocessing to make it parallel. Then I have the following code.
import pandas as pd
from collections import Counter
from multiprocessing import Pool

selected_df = pd.DataFrame()
big_df = pd.DataFrame()

def process_pd(uid):
    global selected_df
    small_df = big_df[big_df['userid'] == uid]
    new_user_items_df = small_df.sort_values(by=['timestamp'])
    selected_df = selected_df.append(new_user_items_df, ignore_index=True)

def init_process():
    global big_df
    big_df = ...  # It contains four columns and many rows.
    big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
    users_list = Counter(big_df['userid'])
    selected_df = pd.DataFrame()  # it is an empty dataframe
    num_cpus = 5
    process_pool = Pool(processes=num_cpus)
    dfs = process_pool.map(process_pd, users_list)
    print(selected_df)

if __name__ == '__main__':
    init_process()
The selected_df is empty. I think it may be because the multiple processes do not update the global dataframe selected_df. But I am new to pandas; could anyone tell me how to modify this code?
Thank you very much.
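No answer was recorded here, but the diagnosis in the question is right: each Pool worker runs in its own process and mutates its own copy of selected_df, so the parent's global never changes. The usual fix is to return the per-user frame from the worker and collect the results of pool.map in the parent. A minimal sketch, assuming a fork-based start method (e.g. Linux) so workers inherit big_df; on Windows/spawn the data would have to be passed to the workers explicitly:

import pandas as pd
from multiprocessing import Pool

big_df = pd.DataFrame()  # populated in init_process before the Pool is created

def process_pd(uid):
    # runs in a worker process; return the result instead of mutating a global
    small_df = big_df[big_df['userid'] == uid]
    return small_df.sort_values(by=['timestamp'])

def init_process():
    global big_df
    big_df = ...  # load the data as before
    big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
    users_list = big_df['userid'].unique()
    with Pool(processes=5) as pool:
        dfs = pool.map(process_pd, users_list)  # collect the returned frames
    selected_df = pd.concat(dfs, ignore_index=True)
    print(selected_df)

if __name__ == '__main__':
    init_process()

Also note that if the order of the user groups does not matter, the whole loop is equivalent to a single big_df.sort_values(by=['userid', 'timestamp']), which needs no multiprocessing at all.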

saving json files into one single csv

I have 100s of similar json files and I want to save the contents of these json files into one single csv file. This is the code I wrote for that, but it's not doing what I want.
Desired output is this csv file: https://drive.google.com/file/d/1cgwdbnvETLf6nO1tNnH0F_-fLxUOdT7L/view?usp=sharing
Please tell me what I can do to get the above output. Thanks.
JSON file format: https://drive.google.com/file/d/1-OZYrfUtDJmwcRUjpBgn59zJt5MjtmWt/view?usp=sharing
import json
import pandas as pd

list_ = ['politifact13565', 'politifact13601']
for i in list_:
    with open("{}/news content.json".format(i)) as json_input:
        json_data = json.load(json_input, strict=False)
        mydict = {}
        mydict["url"] = json_data["url"]
        mydict["text"] = json_data["text"]
        mydict["images"] = json_data["images"]
        mydict["title"] = json_data["title"]
        df = pd.DataFrame.from_dict(mydict, orient='index')
        df = df.T
        df.append(df, ignore_index=True)
        df.to_csv('out.csv')
        print(df)
SOLVED:
import json
import pandas as pd

list_ = ['politifact13565', 'politifact13601']
for i in list_:
    with open("{}/news content.json".format(i)) as json_input:
        json_data = json.load(json_input, strict=False)
        mydict = {}
        mydict["url"] = json_data["url"]
        mydict["text"] = json_data["text"]
        mydict["images"] = json_data["images"]
        mydict["title"] = json_data["title"]
        df = pd.DataFrame.from_dict(mydict, orient='index')
        df = df.T
        df.append(df, ignore_index=True)
        df.to_csv('out.csv', mode='a', header=False)
        print(df)
Your solution is quite close to the desired output; you just need to transpose the imported json:

import glob
import pandas as pd

directory = "your/path/to/jsons/*.json"
df = pd.concat([pd.read_json(f, orient="index").T for f in glob.glob(directory)], ignore_index=True)

Afterwards you can save the df using df.to_csv("tweets.csv").
Hopefully that helps you!

check if the csv file exists and handle both cases?

Hi, I am working with a csv file and have data I want to append to it. First I want to check whether the csv file exists: if TRUE, just open the csv file, append the data, and save it; if NOT, just create a DataFrame with the data and save it.
Note: I already have a csv file, and I want to append the sample of data to it.
Thanks in advance.
Here is my attempt.
import os
import pandas as pd

# sample of data
ID = 5
img_Latitude = 38786454
img_Longitude = 1118468
meta_lat = 45778
meta_long = 886556

# create a function
def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    # check if the file exists; if True
    if os.path.isfile('C:/My/Path/compare_coordinates.csv'):
        # read the csv file
        df = pd.read_csv('compare_coordinates.csv')
        # make a pd.Series
        data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
                         index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
        # append the data to df
        df.append(data, ignore_index=True)
    else:
        data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
        columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
        df = pd.DataFrame(data, columns).T
    df.to_csv('C:/My/Path/compare_coordinates.csv', index=False)
The line df.append(data, ignore_index=True) needs to be:
df = df.append(data, ignore_index=True)
This is because DataFrame.append returns a new DataFrame with the appended rows; it does not append in place:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
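A quick demonstration of that point (this sketch assumes pandas < 2.0, where DataFrame.append still exists):

import pandas as pd

df = pd.DataFrame({'a': [1]})
row = pd.Series({'a': 2})

df.append(row, ignore_index=True)       # returns a NEW frame; df itself is unchanged
print(len(df))                          # 1

df = df.append(row, ignore_index=True)  # rebind the name to keep the result
print(len(df))                          # 2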
The values you need must be saved back into the variable, so the line df.append(data, ignore_index=True) should be edited to df = df.append(data, ignore_index=True). Checking whether the file exists can be done as in the following code:
def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    file_exists = os.path.isfile('My/path/compare_coordinates1.csv')
    if file_exists:
        df = pd.read_csv('compare_coordinates1.csv')
        data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
                         index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
        df = df.append(data, ignore_index=True)
    else:
        data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
        columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
        df = pd.DataFrame(data, columns).T
    df.to_csv('My/path/compare_coordinates1.csv', index=False)
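Worth noting: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the same function would be written with pd.concat. A sketch under the same path and column assumptions as above:

import os
import pandas as pd

def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    path = 'My/path/compare_coordinates1.csv'
    columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
    # build the new row as a one-row frame so it can be concatenated
    new_row = pd.DataFrame([[ID, img_Latitude, img_Longitude, meta_lat, meta_long]],
                           columns=columns)
    if os.path.isfile(path):
        df = pd.concat([pd.read_csv(path), new_row], ignore_index=True)
    else:
        df = new_row
    df.to_csv(path, index=False)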
