Modifying existing text files and resaving it

Modifying existing text files and resaving it - python

I have coded to view .txt and search on specific value. However I want to change that strings and resave the text files with the other content as it is.
import pandas as pd
import numpy as np
import os
txt_location = r'Desktop\Texts\RawData_txt'
files = os.listdir(txt_location)
df = pd.DataFrame(columns=['File Name', 'DG Code', 'location'])
for file in files:
with open(txt_location + '/' + file) as f:
res = f.read()
records = [list(map(str.strip, line.strip().split('\t'))) for line in res.split('\n\n')]
for string in records[0]:
if string.startswith('DG-PIN'):
l = [file, string, records[0].index(string)]
df.loc[len(df)] = l
f.close()
df.to_csv(r'Desktop\Texts\txt_List.csv')
then this file I add the new codes on excel file and read it on another Dataframe**
mod_df = pd.read_excel('Desktop\Texts\Handling_txt.xlsm')
However, I don't know how to change the texts files and resave them.

Related

How to convert multiple json files to cvs files

Hello I have multiple json files in a path and I want to convert all of them to csv files separately. Here is what I have tried so far which just convert one json file to a csv file.
with open('/Users/hh/MyDataSet/traceJSON-663-661-A0-25449-7.json') as f:
for line in f:
data.append(json.loads(line))
csv_file=open('/Users/hh/MyDataSet/GTruth/traceJSON-663-661-A0-25449-7.csv','w')
write=csv.writer(csv_file)
# write.writerow(["row number","type","rcvTime","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
# ,"hed_x","hed_y","hed_z"])
write.writerow(["row number","type","rcvTime","sender","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
,"hed_x","hed_y","hed_z"])
for elem in range(len(data)):
if data[elem]['type']==2:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),'663',round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
elif data[elem]['type']==3:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),round(data[elem]['sender'],2),round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
# json_file.close()
print('done!')
csv_file.close()
I appreciate if anyone can help me how can I do it. Also in each json file name "traceJSON-663-661-A0-25449-7", the first number like in the above code (663) should be written in csv file like the following code,if the type is 2:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),'663',....
My json file names are like traceJSON-51-49-A16-25217-7, traceJSON-57-55-A0-25223-7, ....

I suggest using pandas for this:
from glob import glob
import pandas as pd
import os
filepaths = glob('/Users/hh/MyDataSet/*.json') # get list of json files in folder
for f in filepaths:
filename = os.path.basename(f).rsplit('.', 1)[0] # extract filename without extension
nr = int(filename.split('-')[1]) # extract the number from the filename - assuming that all filenames are formatted similarly, use regex otherwise
df = pd.read_json(f) # read the json file as a pandas dataframe, assuming the json file isn't nested
df['type'] = df['type'].replace(2, nr) # replace 2 in 'type' column with the number in the filename
df.to_csv(f'{filename}.csv') # save as csv
If you want to round columns, you can also do this with pandas

import csv
import glob
import json
import os.path
for src_path in glob.glob('/Users/hh/MyDataSet/*.json'):
src_name = os.path.splitext(os.path.basename(src_path))[0]
data = []
with open(src_path) as f:
for line in f:
data.append(json.loads(line))
dest_path = '/Users/hh/MyDataSet/GTruth/' + src_name + '.csv'
csv_file=open(dest_path,'w')
write=csv.writer(csv_file)
write.writerow(["row number","type","rcvTime","sender","pos_x","pos_y","pos_z","spd_x","spd_y","spd_z","acl_x","acl_y","acl_z"
,"hed_x","hed_y","hed_z"])
for elem in range(len(data)):
if data[elem]['type']==2:
sender = src_name.split('-')[1]
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),sender,round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
elif data[elem]['type']==3:
write.writerow([elem,data[elem]['type'],round(data[elem]['rcvTime'],2),round(data[elem]['sender'],2),round(data[elem]['pos'][0],2),round(data[elem]['pos'][1],2)
,round(data[elem]['pos'][2],2),round(data[elem]['spd'][0],2),round(data[elem]['spd'][1],2),round(data[elem]['spd'][2],2),
round(data[elem]['acl'][0],2),round(data[elem]['acl'][1],2),round(data[elem]['acl'][2],2),round(data[elem]['hed'][0],2),
round(data[elem]['hed'][1],2),round(data[elem]['hed'][2],2)])
csv_file.close()
print('done!')

How to remove newline characters so it doesn't produce _x000D_ when reading excel files in Python [duplicate]

This question already has an answer here:
reading Excel file getting unicodes
(1 answer)
Closed 11 months ago.
I try to read Excel files from a series of folders where each folder contains lots(hundreds literally) of Excel files.
This is the process I've been doing:
import numpy as np
import os
import glob
def read_files(path):
df2 = pd.DataFrame()
data = pd.DataFrame()
for each in os.listdir(path):
sub_path = path+"/"+each
files = glob.glob(os.path.join(sub_path, "*.xlsx"))
for f in files:
df = pd.read_excel(f)
data = data.append(df)
df2 = df2.append(data)
return df2
df = read_files("...my_path")
However, it converts newline characters in the data that it read from excel files to _x000D_. For example, if a particular cell value is:
abcde
It becomes:
abcde_x000D_
How can I eliminate this, and actually remove the newlines at the reading step, so that it doesn't produce _x000D_?

Based on this related question, it seems that Excel uses \n chr(10) as newline, but your files have \r\n chr(13)chr(10). One way to avoid this is to perform a replace operation. Every time you see \r\n, replace it with \n. I modify the file in buffer before sending to pandas.
import numpy as np
import os
import glob
def read_files(path):
df2 = pd.DataFrame()
data = pd.DataFrame()
for each in os.listdir(path):
sub_path = path+"/"+each
files = glob.glob(os.path.join(sub_path, "*.xlsx"))
for f in files:
with open(f, 'rb') as fp:
text = fp.read().replace(b'\r\n', b'\n')
df = pd.read_excel(text)
data = data.append(df)
df2 = df2.append(data)
return df2
df = read_files("...my_path")

Renaming csv files according to some contents inside the csv file in python

I have many csv files in one subfolder, say data. Each of these .csv files contain a date column.
430001.csv, 43001(1).csv,43001(2).csv,..........,43001(110).csv etc.
I want to rename all the files in folder according to the date inside column of csv file.
Desired output:
430001-1980.csv, 43001-1981.csv,43001-1985.csv,..........,43001-2010.csv etc.
I tried to follow the steps advised in :
Renaming multiple csv files
Still could not get the desired output.
Any help would be highly appreciated.
Thanks!

You can loop through them, extract the date to create a new filename, and then save it.
# packages to import
import os
import pandas as pd
import glob
import sys
data_p = "Directory with your data"
output_p = "Directory where you want to save your output"
retval = os.getcwd()
print (retval) # see in which folder you are
os.chdir(data_p) # move to the folder with your data
os.getcwd()
filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames) # get the names of all your files
#print(fnames)
for f in range(len(fnames)):
print(f'fname: {fnames[f]}\n')
pfile = pd.read_csv(fnames[f], delimiter=",") # read in file
#extract filename
filename = fnames[f]
parts = filename.split(".") # giving you the number in file name and .csv
only_id = parts[0].split("(") # if there is a bracket included
# get date from your file
filedate = pfile["date"][0] # assuming this is on the first row
filedate = str(filedate)
# get new filename
newfilename = only_id[0]+"-"+filedate+parts[1]
# save your file (don't put a slash at the end of your directories on top)
pfile.to_csv(output_p+"/"+newfilename, index = False, header = True)

Python combine CSVs, remove header and remove blanks

I’m extremely new to Python & trying to figure the below out:
I have multiple CSV files (monthly files) that I’m trying to combine into a yearly file. The monthly files all have headers, so I’m trying to keep the first header & remove the rest. I used the below script which accomplished this, however there are 10 blank rows between each month.
Does anyone know what I can add to this to remove the blank rows?
import shutil
import glob
#import csv files from folder
path = r'data/US/market/merged_data'
allFiles = glob.glob(path + "/*.csv")
allFiles.sort() # glob lacks reliable ordering, so impose your own if output order matters
with open('someoutputfile.csv', 'wb') as outfile:
for i, fname in enumerate(allFiles):
with open(fname, 'rb') as infile:
if i != 0:
infile.readline() # Throw away header on all but first file
# Block copy rest of file from input to output without parsing
shutil.copyfileobj(infile, outfile)
print(fname + " has been imported.")
Thank you in advance!

assuming the dataset isn't bigger than you memory, I suggest reading each file in pandas, concatenating the dataframes and filtering from there. blank rows will probably show up as nan.
import pandas as pd
import glob
path = r'data/US/market/merged_data'
allFiles = glob.glob(path + "/*.csv")
allFiles.sort()
df = pd.Dataframe()
for i, fname in enumerate(allFiles):
#append data to existing dataframe
df = df.append(pd.read(fname), ignore_index = True)
#hopefully, this will drop blank rows
df = df.dropna(how = 'all')
#write to file
df.to_csv('someoutputfile.csv')

Problrms reading csv files via numpy and pandas

I have a folder with a certain path and I want to go through all the folders within this folder. Each subfolder contains multiple files of which I only want a certain ".csv" file. I succeed in reading the different folders and selecting the correct file, but when I try to open it (bith with pandas and numpy), I get an IOError stating that the corresponding file doesn't exist..
import pandas as pd
import numpy as np
path = "some_path"
data_list = os.listdir(path)
array = []
for file in data_list:
if file.endswith("name_requirement"):
array.append(function(file))
file_filter = "second_name_requirement"
def function(second_path):
file_list = os.listdir(path + "\\" + str(second_path))
average = 0
for file in file_list:
if str(file)[-20:-4] == file_filter:
measurements = pd.read_csv(file, delimiter = ";", skiprows = 1, usecols = [1])
# measurements = np.loadtxt(file, delimiter =";", skiprows =1)
.....
`

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Modifying existing text files and resaving it - python

Related

How to convert multiple json files to cvs files

How to remove newline characters so it doesn't produce _x000D_ when reading excel files in Python [duplicate]

Renaming csv files according to some contents inside the csv file in python

Python combine CSVs, remove header and remove blanks

Problrms reading csv files via numpy and pandas

Categories

Resources