check if the csv file exists and do the condition? - python

Hi I am working on csv file and I have a data I want to append these data to the csv file. But firstly I want to check if the csv file exists if TRUE then just open the csv file and append the data to csv file and save it, if NOT just create a DataFrame and with these data and save it.
Note: I have a csv file in my I want to append the sample of data to my csv file
thanks in advance.
here is my trying.
#sample of data
ID = 5
img_Latitude = 38786454
img_Longitude = 1118468
meta_lat = 45778
meta_long = 886556
#create a function
def create_csv( ID, img_Latitude, img_Longitude,meta_lat, meta_long):
#check if the file is exists, if True
if os.path.isfile('C:/My/Path/compare_coordinates.csv'):
#read the csv file
df = pd.read_csv('compare_coordinates.csv')
#make pd.series
data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long'])
#append the data to df
df.append(data, ignore_index=True)
else:
data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long']
df = pd.DataFrame(data, columns).T
df.to_csv('C:/My/Path/compare_coordinates.csv', index=False)

The line df.append(data, ignore_index = True) needs to be:
df = df.append(data, ignore_index = True)
This is because DatFrame.append returns a new DF with the appended lines, it does not append in-place:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html

To get the values that needed must be saved in variable so for the line
df.append(data, ignore_index = True) to be edited to df = df.append(data, ignore_index = True) and for the getting value of file exists or not as following codes:
def create_csv( ID, img_Latitude, img_Longitude,meta_lat, meta_long):
Path = os.path.isfile('My/path/compare_coordinates1.csv')
if Path==True:
df = pd.read_csv('compare_coordinates1.csv')
data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long'])
df = df.append(data, ignore_index=True)
else:
data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long']
df = pd.DataFrame(data, columns).T
df.to_csv('My/path/compare_coordinates1.csv', index=False)

Related

Read CSV file with Python function

I'm trying to write my read/write function to a csv, but it can't return any value.
I'm reading from a CSV, replacing the " ; " in the second column with " " and performing and saving the csv already handled.
But for some reason it doesn't save my csv, is my function wrong?
I'm starting out in the Python world, and I'm having a bit of trouble.
import pandas as pd
header_col = ['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';', names=header_col, header=None)
def file_load(df):
df['col1'] = df['col1'].str.replace(';',' ')
df.drop(columns=['col8'], inplace=True)
df.drop(columns=['col9'], inplace=True)
return df
def save_file(dataframe):
df = dataframe
df.to_csv('myfile_<date>_treat.csv' ,sep=';', encoding='utf-8', index=False)
import pandas as pd
def file_load(df):
df['col1'] = str(df['col1']).replace(';',' ')
df.drop(columns=['col8'], inplace=True)
df.drop(columns=['col9'], inplace=True)
return df
def save_file(dataframe):
df = dataframe
df.to_csv('myfile_<date>_treat.csv' ,sep=',', encoding='utf-8',
index=False)
def main():
header_col=
['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';',
names=header_col, header=None)
df1 = file_load(df)
save_file(df1)
if __name__ == '__main__':
main()

saving json files into one single csv

I have 100s of similar json files and I want to save the contents of these json files into one single csv file. This is the code I wrote for the same. But it's not doing what I want to do.
Desired output is csv file: https://drive.google.com/file/d/1cgwdbnvETLf6nO1tNnH0F_-fLxUOdT7L/view?usp=sharing
Please tell me what can be done to get the above output? Thanks
JSON file format: https://drive.google.com/file/d/1-OZYrfUtDJmwcRUjpBgn59zJt5MjtmWt/view?usp=sharing
list_=['politifact13565', 'politifact13601']
for i in list_:
with open("{}/news content.json".format(i)) as json_input:
json_data = json.load(json_input, strict=False)
mydict = {}
mydict["url"] = json_data["url"]
mydict["text"] = json_data["text"]
mydict["images"]=json_data["images"]
mydict["title"]=json_data["title"]
df = pd.DataFrame.from_dict(mydict, orient='index')
df = df.T
df.append(df, ignore_index=True)
df.to_csv('out.csv')
print(df)
SOLVED:
list_=['politifact13565', 'politifact13601']
for i in list_:
with open("{}/news content.json".format(i)) as json_input:
json_data = json.load(json_input, strict=False)
mydict = {}
mydict["url"] = json_data["url"]
mydict["text"] = json_data["text"]
mydict["images"]=json_data["images"]
mydict["title"]=json_data["title"]
df = pd.DataFrame.from_dict(mydict, orient='index')
df = df.T
df.append(df, ignore_index=True)
df.to_csv('out.csv', mode='a', header=False)
print(df)
Your solution is quite close to the desired output, you just need to transpose the imported json:
import glob
directory = "your/path/to/jsons/*.json"
df = pd.concat([pd.read_json(f, orient="index").T for f in glob.glob(directory)], ignore_index=True)
Aferwards you can save the df using df.to_csv("tweets.csv")
Hopefully that helps you!
list_=['politifact13565', 'politifact13601']
for i in list_:
with open("{}/news content.json".format(i)) as json_input:
json_data = json.load(json_input, strict=False)
mydict = {}
mydict["url"] = json_data["url"]
mydict["text"] = json_data["text"]
mydict["images"]=json_data["images"]
mydict["title"]=json_data["title"]
df = pd.DataFrame.from_dict(mydict, orient='index')
df = df.T
df.append(df, ignore_index=True)
df.to_csv('out.csv', mode='a', header=False)
print(df)

Read multiple file in python and generate one output

I have a python script for generating 1 upload file from 1 input file.
The thing is that the input files have started coming in batches, 30-50 at one time.
e.g.:
1111.xlsx --> upload.xlsx
1125.xlsx --> upload.xlsx
1176.xlsx --> upload.xlsx
1322.xlsx --> upload.xlsx
The code just converting the input files in the upload format.
Here's what I have done so far (1 input file -> 1 output file):
def main():
initial_workbook = 'C:/files/1111.xlsx'
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames , inplace=True)
initial_df.dropna(axis=1, how='all')
output = initial_df.to_excel('C:/files/upload_file.xlsx', index = False)
Is there a way to generate one upload file for all the files from the input folder. And once the files input files have been processed, rename them by prefixing x in front of it. e.g. x1111.xlsx
So here is how I will approach, for a given batch:
from datetime import datetime
import os
from pathlib import Path
all_dfs = []
proj_path = Path("C:/files/")
for f in os.listdir(proj_path):
if f.endswith(".xlsx"):
print(f"processing {f}...")
df_tmp = main(proj_path / f)
df_tmp["file_name"] = f
all_dfs.append(df_tmp)
df_all = pd.concat(all_dfs, axis=0)
df_all.to_excel(proj_path / f"{datetime.now()}_batch.xlsx", index = False)
def main(f):
initial_workbook = proj_path / f
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames, inplace=True)
initial_df.dropna(axis=1, how='all', inplace=True)
return initial_df
You can potentially enclose the logic for a batch in a function.

Python adding multiple excel sheet into the same pdf

I am using Ubuntu 16.0.4. After reading from an excel file, I am trying to add multiple excel sheet to a pdf file.
df = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df = df.dropna(axis = 1, how='all')
df = df.dropna(how='all')
df.to_html("file.html")
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)
How can I add another excel sheet from the same excel file to the same pdf file without overwriting the previous sheet that is in the pdf?
Thanks!
If the two sheets have the same data structure (columns and etc.):
df1 = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df2 = pd.read_excel(excel_name, sheet_name = 'Sheet2')
df = df1.append(df2)
If not:
df1 = pd.read_excel(excel_name, sheet_name = 'Sheet1')
df2 = pd.read_excel(excel_name, sheet_name = 'Sheet2')
# Do whatever you need to transform the dfs
html_str = '<br />'.join([df1.to_html(), df2.to_html()])
with open("file.html", "w") as text_file:
text_file.write(html_str)
pdf_name = name_of_file + '.pdf'
pdfkit.from_file("file.html", pdf_name)

Iterate and split excel filenames and save as dataframe in Pandas

Say I have a folder folder1 with excel files, their filenames share same structures: city, building name and id, I want save them in dataframe and then excel file. Please note I also need to append other folders' excel filenames in result.
bj-LG center-101012.xlsx
sh-ABC tower-1010686.xlsx
bj-Jinzhou tower-101018.xlsx
gz-Zijin building-101012.xls
...
The first method I have tried:
import os
import pandas as pd
from pandas import DataFrame, ExcelWriter
path = os.getcwd()
file = [".".join(f.split(".")[:-1]) for f in os.listdir() if os.path.isfile(f)] #exclude files' extension
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#print(city)
df = pd.DataFrame(columns = ['city', 'building name', 'id'])
df['city'] = city
df['building name'] = projectName
df['id'] = projectID
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
Problem:
Traceback (most recent call last):
File "<ipython-input-203-c09878296e72>", line 9, in <module>
city = file.split('-')[0]
AttributeError: 'list' object has no attribute 'split'
My second method:
for root, directories, files in os.walk(path):
#print(root)
for file in files:
if file.endswith('.xlsx') or file.endswith('.xls'):
#print(file)
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#print(city)
df = pd.DataFrame(columns = ['city', 'building name', 'id'])
df['city'] = city
df['building name'] = projectName
df['id'] = projectID
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
I got an empty test.xlsx file, how could I make it works? Thanks.
This splits off the file extension, then unpacks the split into the vairables.
Creates a dictionary then appends the dictionary to the dataframe.
files = [
"bj-LG center-101012.xlsx",
"sh-ABC tower-1010686.xlsx",
"bj-Jinzhou tower-101018.xlsx",
"gz-Zijin building-101012.xls"]
df = pd.DataFrame()
for file in files:
filename = file.split(".")[0]
city, projectName, projectID = filename.split("-")
d = {'city':city,'projectID':projectID,'projectName':projectName}
df = df.append(d,ignore_index=True)
df.to_excel('summary.xlsx')
Method 2 is close.
You need to create the dataframe before the for loops. After your variable assignments, make a dictionary of the variables and append it to the dataframe.
There is also probably a better way to find your file list using glob, but i will just work with what you have already done.
df = pd.DataFrame()
for root, directories, files in os.walk(path):
for file in files:
if file.endswith('.xlsx') or file.endswith('.xls'):
#print(file)
city = file.split('-')[0]
projectName = file.split('-')[1]
projectID = file.split('-')[2]
#append data inside inner loop
d = {'city':city, 'building name':projectname, 'id':projectID}
df.append(d)
writer = pd.ExcelWriter("C:/Users/User/Desktop/test.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()
This should works, thanks to the hint of use glob from #Dan Wisner
import os
from glob import glob
fileNames = [os.path.splitext(val)[0] for val in glob('*.xlsx') or glob('*.xls')]
df = pd.DataFrame({'fileNames': fileNames})
df[['city', 'name', 'id']] = df['fileNames'].str.split('-', n=2, expand=True)
del df['fileNames']
writer = pd.ExcelWriter("C:/Users/User/Desktop/test1.xlsx", engine='xlsxwriter')
df.to_excel(writer, index = False)
writer.save()

Categories

Resources