Merge semicolon-delimited txt files by looping over a directory - python

Suppose I have many different text files from the same directory with the content structure as shown below:
File a.txt:
HEADER_X;HEADER_Y;HEADER_Z
a_value;a_value;a_value
a_value;a_value;a_value
File b.txt:
HEADER_X;HEADER_Y;HEADER_Z
b_value;b_value;b_value
b_value;b_value;b_value
File c.txt:
HEADER_X;HEADER_Y;HEADER_Z
c_value;c_value;c_value
c_value;c_value;c_value
File d.txt: ...
I'd like to merge all of the txt files into one, appending the content of each file after the final row of the previous file. See below:
File combined.txt:
HEADER_X;HEADER_Y;HEADER_Z
a_value;a_value;a_value
a_value;a_value;a_value
b_value;b_value;b_value
b_value;b_value;b_value
c_value;c_value;c_value
c_value;c_value;c_value
...
How can I do this in Python?
Assumptions:
- all txt files are located in the same folder
- all txt files have same headers
- all txt files have same number of columns
- all txt files have different number of rows

Use the csv module. Something like this:
import csv

with open('output.csv', 'w', newline='') as output:
    writer = csv.writer(output, delimiter=";")
    with open('a.txt', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=";")
        next(reader)  # skip the header (write it once yourself, from the first file)
        for row in reader:
            writer.writerow(row)
If you didn't want to hardcode every file (say you have many more than the four shown), you could do:
import csv
from os import listdir
from os.path import isfile, join

mypath = '.'  # the folder containing the txt files
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
with open('output.csv', 'w', newline='') as output:
    writer = csv.writer(output, delimiter=";")
    header_written = False
    for afile in onlyfiles:
        with open(join(mypath, afile), newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=";")
            header = next(reader)
            if not header_written:  # write the header once, from the first file
                writer.writerow(header)
                header_written = True
            for row in reader:
                writer.writerow(row)
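Not part of the answer above, but if pandas is available the whole merge fits in a few lines. This is a sketch under the question's assumptions (identical headers, ";" delimiter, all inputs are .txt files in the current folder):
import glob
import pandas as pd

# stack every .txt file in the folder; identical headers line up automatically
frames = [pd.read_csv(path, sep=';') for path in sorted(glob.glob('*.txt'))]
pd.concat(frames, ignore_index=True).to_csv('combined.txt', sep=';', index=False)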

I managed to do something that seems to work (at least in the cases I tested).
This parses all the files, collects all the headers, and pads the values on each line of each file with extra ";" separators according to the headers present or absent in that file.
headers = []
values = []
files = ("csv0.txt", "csv1.txt")  # put the files you want to parse here

# read the files a first time, just to get the headers
for file_name in files:
    file = open(file_name, 'r')
    first_line = True
    for line in file:
        if first_line:
            first_line = False
            for header in line.strip().split(";"):
                if header not in headers:
                    headers.append(header)
        else:
            break
    file.close()
headers = sorted(headers)

# read a second time to get the values
file_number = 0
for file_name in files:
    file = open(file_name, 'r')
    first_line = True
    corresponding_indexes = []
    values.append([])
    for line in file:
        if first_line:
            first_line = False
            index = 0
            for header in line.strip().split(";"):
                while headers[index] != header:
                    index += 1
                corresponding_indexes.append(index)
        else:
            line_values = line.strip().split(";")
            current_index = 0
            values_str = ""
            for value in line_values:
                # this part writes the values, adding ";" for the headers not in this file
                while current_index not in corresponding_indexes:
                    current_index += 1
                    values_str += ";"
                values_str += value + ";"
                current_index += 1
            values_str = values_str[:-1]  # we remove the last ";" (useless)
            values[file_number].append(values_str)
    file_number += 1
    file.close()

# and now we write the output file with all headers and values
headers_str = ""
for header in headers:
    headers_str += header + ";"
headers_str = headers_str[:-1]
output_file = open("output.txt", 'w')
output_file.write(headers_str + "\n")
for file_values in values:
    for values_line in file_values:
        output_file.write(values_line + "\n")
output_file.close()
If you have any question, feel free to ask.
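For comparison only (not from the answer above): the same two-pass idea can be sketched with csv.DictReader and csv.DictWriter, where restval='' does the padding for columns a file lacks. The filenames are the same hypothetical ones as above:
import csv

files = ("csv0.txt", "csv1.txt")

# first pass: collect the union of all headers
headers = []
for file_name in files:
    with open(file_name, newline='') as f:
        for h in csv.DictReader(f, delimiter=';').fieldnames:
            if h not in headers:
                headers.append(h)
headers.sort()

# second pass: write every row; restval='' fills in the missing columns
with open("output.txt", 'w', newline='') as out:
    writer = csv.DictWriter(out, fieldnames=headers, delimiter=';', restval='')
    writer.writeheader()
    for file_name in files:
        with open(file_name, newline='') as f:
            writer.writerows(csv.DictReader(f, delimiter=';'))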

Related

How to convert each row of a csv file to a separate text file?

I have this data:
Now I want to convert each row of this csv file to a text file. The text file's name should be the first element of the row (i.e. 0d4b3896f938d981.txt), and the text file will contain only the elements of columns 1 to 5 (i.e. 3 0.372070 0.873913 0.066406 0.158261).
maybe you can try this:
import csv
import os

base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(os.path.join(base, "any.csv"), mode='r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';')
    for row in csv_reader:
        # one text file per row, named after the first column
        with open(os.path.join(base, row[0] + ".txt"), mode='w') as txt_file:
            # keep only the elements of columns 1 to 5
            txt_file.write(' '.join(row[1:6]))
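With the sample row from the question, this writes a file named 0d4b3896f938d981.txt containing: 3 0.372070 0.873913 0.066406 0.158261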
I hope it helps you, greetings.

How to split a large file into chunks using python

import csv

divisor = 1000000
outfileno = 1
outfile = None
with open('o77069882.txt', 'r') as infile:
    infile_iter = csv.reader(infile)
    header = next(infile_iter)
    for index, row in enumerate(infile_iter):
        if index % divisor == 0:
            if outfile is not None:
                outfile.close()
            outfilename = 'big-{}.csv'.format(outfileno)
            outfile = open(outfilename, 'w', newline='')
            outfileno += 1
            writer = csv.writer(outfile)
            writer.writerow(header)
        writer.writerow(row)
# don't forget to close the last file
if outfile is not None:
    outfile.close()
I am using the code above to divide my file into chunks of size 100000. It does the job, but each row in the first file is getting enclosed in quotes (" "), for example "abc, dfg, ghj, kil". The second and third files created do not have this problem. Can anyone help me modify my code to get rid of this error?
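The thread leaves this unresolved, but one plausible cause (a guess, not confirmed here) is that the affected lines are parsed as a single field containing commas, which csv.writer then wraps in quotes on output. Passing an explicit delimiter to the reader is the first thing to check; the ';' below is only an assumption for illustration:
import csv

with open('o77069882.txt', 'r', newline='') as infile:
    # if the source is really semicolon-delimited, say so explicitly, otherwise
    # each line arrives as one comma-filled field and gets quoted on write
    reader = csv.reader(infile, delimiter=';')
    print(next(reader))  # should print a list of separate fields, not one long string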

Python read CSV file columns and write file name and column name in a csv file

I have many CSV files and need to read them all in a loop, then write the file name and all of the columns (the header in row 1) to an output file.
Example
Input csv file 1 (test1.csv)
Id, Name, Age, Location
1, A, 25, India
Input csv file 2 (test2.csv)
Id, ProductName
1, ABC
Outputfile
test1.csv Id
test1.csv Name
test1.csv Age
test1.csv Location
test2.csv Id
test2.csv ProductName
Many thanks for your help.
Update:
This code works fine for this purpose:
import os
import csv

ofile = open(r'D:\Anuj\Personal\OutputFile\AHS_File_Columns_Info.csv', 'w')
directory = os.path.join(r'D:\Anuj\Personal\Python')
for root, dirs, files in os.walk(directory):
    for file in files:
        fullfilepath = directory + "/" + file
        with open(fullfilepath, 'r') as f:
            output = file + ',' + f.readline()
            ofile.write(output)
ofile.close()
A clean solution using the csv module for reading and writing:
- open the output file and create a csv.writer instance on its handle
- open each input file and create a csv.reader instance on its handle
- get the first row using next on the csv.reader iterator: this gives the titles as a list (with a small post-processing step to remove the spaces)
- write the titles alongside the current filename in a loop
code:
import csv

files = ["test1.csv", "test2.csv"]
with open("output.tsv", "w", newline='') as fw:
    cw = csv.writer(fw, delimiter="\t")  # output is tab delimited
    for filename in files:
        with open(filename, 'r') as f:
            cr = csv.reader(f)
            # get the titles from the first row
            for column_name in (x.strip() for x in next(cr)):
                cw.writerow([filename, column_name])
There are several advantages to using the csv module, the most important being that quoting & multi-line fields/titles are managed properly.
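A small illustration of that point (the file name and titles here are made up): a title containing a comma survives a csv round-trip but breaks naive line splitting:
import csv

# hypothetical header whose second title contains a comma
with open('demo.csv', 'w', newline='') as f:
    csv.writer(f).writerow(['Id', 'Age, years'])

with open('demo.csv') as f:
    print(next(csv.reader(f)))              # ['Id', 'Age, years'] -- two titles
with open('demo.csv') as f:
    print(f.readline().strip().split(','))  # ['Id', '"Age', ' years"'] -- wrong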
But I'm not sure I understand you correctly.
import csv
from typing import List, Tuple

TableType = List[List[str]]

def load_csv_table(file_name: str) -> Tuple[List[str], TableType]:
    with open(file_name) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        headers = next(csv_reader)
        data_table = list(csv_reader)
        return headers, data_table

def save_csv_table(file_name: str, headers: List[str], data_table: TableType):
    with open(file_name, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(headers)
        for row in data_table:
            writer.writerow(row)

input_files = ['file1.csv', 'file2.csv', 'file3.csv']
new_table = []
new_headers = []
for file_name in input_files:
    headers, data_table = load_csv_table(file_name)
    if not new_headers:
        new_headers = ['Source'] + headers
    new_table.extend([file_name] + line for line in data_table)

save_csv_table('output.csv', new_headers, new_table)
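A note on the design choice here: new_headers is taken from the first file only (plus the added Source column), so with inputs shaped like test1.csv and test2.csv above, rows from the second file are appended under the first file's header and simply have fewer columns. If the inputs can have genuinely different headers, the union-of-headers approach from the first question on this page is the safer route.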
A simple method is to use readline() on the file object:
files = ["test1.csv", "test2.csv"]
for my_file in files:
    with open(my_file, 'r') as f:
        print(my_file, f.readline())

csv value modification for certain cells on odd rows on a particular column

Hi, I'm trying to finish this small piece of code for modifying csv files; I've got this far with some help:
edit... some more info.
Basically what I’m looking to do is make some small changes to the csv file depending on the project and parent issue in JIRA. Python will then make the changes to the csv file before it is then read into JIRA - that’s the second part of the program I’ve not even really looked at yet.
I’m only looking to change the BOX-123 type cells and leave the blank ones blank.
But the idea of the program is that I can use it to make some small changes to a template which will then automatically create some issues in JIRA.
import os
import csv

project = 'Dudgeon'
parent = 'BOX-111'
rows = (1, 1007)
current = os.getcwd()
filename = 'test.csv'
filepath = os.path.join(os.getcwd(), filename)
#print(current)
#print(filename)
print(filepath)

with open(filepath, 'r') as csvfile:
    readCSV = csv.reader(csvfile)
    next(readCSV, None)
    for row in readCSV:
        print(row[16])
    row_count = sum(1 for row in readCSV)
    print(row_count)

with open(filepath, 'r') as infile, open('out.csv', 'w') as outfile:
    outfile.write(infile.readline())  # write out the 1st line
    for line in infile:
        cols = line.strip().split(',')
        cols[16] = project
        outfile.write(','.join(cols) + '\n')

with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
    for row in infile:
        if row % 2 != 0:
            cols[15] = parent
            outfile.write()
Any help really appreciated.
You want to use the row's index when comparing to 0. Use enumerate():
with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
    for rowidx, row in enumerate(infile):
        cols = row.strip().split(',')
        if rowidx % 2 != 0:
            cols[15] = parent
        outfile.write(','.join(cols) + '\n')
You really should be using the csv module here, though. Untested but should get you started.
with open('out.csv', 'r') as infile, open('out1.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for rowidx, row in enumerate(reader):
        if rowidx % 2 != 0:
            row[15] = parent
        writer.writerow(row)
A friend helped me last night and this is what they came up with:
with open(filepath, 'r') as infile, open('out.csv', 'w') as outfile:
    outfile.write(infile.readline())  # write out the 1st line
    for line in infile:
        cols = line.strip().split(',')
        cols[16] = project
        outfile.write(','.join(cols) + '\n')

with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
    outfile.write(infile.readline())  # write out the 1st line
    lineCounter = 0
    for line in infile:
        lineCounter += 1
        cols = line.strip().split(',')
        if lineCounter % 2 != 0:
            cols[15] = parent
        outfile.write(','.join(cols) + '\n')
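As a footnote, the two passes can be collapsed into a single csv-based pass. This is only a sketch reusing the question's filepath, project, and parent variables and its column numbers, not tested against the real template:
import csv

with open(filepath, 'r', newline='') as infile, open('out1.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    writer.writerow(next(reader))  # copy the header row unchanged
    for lineno, row in enumerate(reader, start=1):
        row[16] = project          # every data row gets the project
        if lineno % 2 != 0:        # odd data rows also get the parent issue
            row[15] = parent
        writer.writerow(row)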

Want to read multiple csv files one by one, with the file paths stored in a text file, using python

here is my code for reading individual cells of one csv file, but I want to read multiple csv files one by one from a .txt file where the csv file paths are listed.
import csv

ifile = open(r"C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "r")
data = list(csv.reader(ifile, delimiter=';'))
REQ = []
RES = []
n = len(data)
for i in range(n):
    x = data[i][1]
    y = data[i][2]
    REQ.append(x)
    RES.append(y)
for j in range(2, n):
    try:
        if REQ[j] != '' and RES[j] != '':  # ignore blank cells
            print(REQ[j], ' ', RES[j])
    except:
        pass
And csv file paths are stored in a .txt file like
C:\Desktop\Test_Specification\RDBI.csv
C:\Desktop\Test_Specification\ECUreset.csv
C:\Desktop\Test_Specification\RDTC.csv
and so on..
You can read stuff stored in files into variables. And you can use variables with strings in them anywhere you can use a literal string. So...
with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()  # or was it trim()? I keep mixing them up
        ifile = open(file_name, 'r')
        # ... the rest of your code goes here
Maybe we can fix this up a little...
import csv

with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()
        with open(file_name, 'r', newline='') as f:
            csv_file = csv.reader(f, delimiter=';')
            next(csv_file)  # skip header row
            for record in csv_file:
                req = record[1]
                res = record[2]
                if len(req + res):
                    print(req, ' ', res)
you just need to add a loop which reads the file containing your list of file paths around your first open statement, for example
with open("myfile_which_contains_file_path.txt") as f:
    for line in f:
        ifile = open(line.strip(), 'r')
        # here the rest of your code
You need to use a raw string, since your path contains backslashes:
import csv

file_list = r"C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"
with open(file_list) as f:
    for line in f:
        with open(line.strip(), 'r', newline='') as the_file:
            reader = csv.reader(the_file, delimiter=';')
            for row in reader:
                req, res = row[1:3]
                if req and res:
                    print('{0} {1}'.format(req, res))
