CSV file processing in Python

I work with spatial data that is output to text files with the following format:
COMPANY NAME
P.O. BOX 999999
ZIP CODE , CITY
+99 999 9999
23 April 2013 09:27:55
PROJECT: Link Ref
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Design DTM is 30MB 2.5X2.5
Stripping applied to design is 0.000
Point Number Easting Northing R.L. Design R.L. Difference Tol Name
3224808 422092.700 6096059.380 2.520 -19.066 -21.586 --
3224809 422092.200 6096059.030 2.510 -19.065 -21.575 --
<Remainder of lines>
3273093 422698.920 6096372.550 1.240 -20.057 -21.297 --
Average height difference is -21.390
RMS is 21.596
0.00 % above tolerance
98.37 % below tolerance
End of Report
As shown, the files have a header and a footer. The data is delimited by spaces, but not by an equal number of spaces between the columns.
What I need is comma-delimited files containing Easting, Northing and Difference.
I'd like to avoid modifying several hundred large files by hand, so I am writing a small script to process the files. This is what I have so far:
#! /usr/bin/env python
import csv, glob, os
from itertools import islice

list_of_files = glob.glob('C:/test/*.txt')
for filename in list_of_files:
    (short_filename, extension) = os.path.splitext(filename)
    print short_filename
    file_out_name = short_filename + '_ed' + extension
    with open(filename, 'rb') as source:
        reader = csv.reader(source)
        for row in islice(reader, 10, None):
            file_out = open(file_out_name, 'wb')
            writer = csv.writer(file_out)
            writer.writerows(reader)
            print 'Created file: ' + file_out_name
            file_out.close()
print 'All done!'
Questions:
How can I make the line starting with 'Point Number' become the header in the output file? I'm trying to put DictReader in place of the reader/writer bit but can't get it to work.
Writing the output file with delimiter ',' does work, but it writes a comma in place of every space, giving far too many empty columns in my output file. How do I circumvent this?
How do I remove the footer?

I can see a problem with your code: you are creating a new writer for each row, so you will end up with only the last one.
Your code could be something like this, without the need for CSV readers or writers, as the format is simple enough to be parsed as plain text (problems would arise if you had text columns, with escaped characters and so on).
import glob, os

list_of_files = glob.glob('C:/test/*.txt')

def process_file(source, dest):
    header_found = False
    for line in source:
        line = line.strip()
        if not header_found:
            # ignore everything until we find this text
            header_found = line.startswith('Point Number')
        elif not line:
            return  # we are done when we find an empty line, I guess
        else:
            # write the needed columns
            columns = line.split()
            dest.write(','.join(columns[i] for i in (1, 2, 5)) + '\n')

for filename in list_of_files:
    short_filename, extension = os.path.splitext(filename)
    file_out_name = short_filename + '_ed' + extension
    with open(filename, 'r') as source:
        with open(file_out_name, 'w') as dest:
            process_file(source, dest)

This worked:
#! /usr/bin/env python
import glob, os

list_of_files = glob.glob('C:/test/*.txt')

def process_file(source, dest):
    header_found = False
    for line in source:
        line = line.strip()
        if not header_found:
            # ignore everything until we find this text
            header_found = line.startswith('Stripping applied')  # otherwise, the header line is lost
        elif not line:
            return  # we are done when we find an empty line
        else:
            # write the needed columns
            columns = line.split()
            dest.writelines(','.join(columns[i] for i in (1, 2, 5)) + "\n")  # adding the newline character was necessary

for filename in list_of_files:
    short_filename, extension = os.path.splitext(filename)
    file_out_name = short_filename + '_ed' + ".csv"
    with open(filename, 'r') as source:
        with open(file_out_name, 'wb') as dest:
            process_file(source, dest)

To answer your first and last questions: it is simply a matter of ignoring the corresponding lines, i.e. not writing them to the output. This corresponds to the if not header_found and elif not line: blocks of fortran's proposal.
The second point is that there is no single dedicated delimiter in your file: you have one or more spaces between columns, which makes it hard to parse using the csv module. Calling split() with no arguments splits each line on any run of whitespace and returns the list of non-blank tokens, so it returns only the useful values.
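As a quick illustration, here is a minimal sketch (using one data line from the sample above) of why split() copes with the uneven spacing for free:

line = '3224808     422092.700   6096059.380    2.520   -19.066   -21.586   --'
columns = line.split()  # splits on any run of whitespace
print(','.join(columns[i] for i in (1, 2, 5)))  # Easting, Northing, Difference
# prints: 422092.700,6096059.380,-21.586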

Related

\ at the end of each line of a txt

I want to write the pathnames of images into a .txt file in Python. I create a new line for each path, yet an unwanted \ appears at the end of each image path. I tried to strip it, yet nothing happens. How can I strip it?
for num, name in enumerate(dirNames):
    os.mkdir(name)
    os.chdir(name)
    with open(f'{os.path.basename(name)}.txt', 'w+', encoding='utf-8') as f:
        for tile in png_cropedShips[num]:
            path = os.path.join('tiles', os.path.basename(name), os.path.basename(tile), '\n')
            print(path)
            f.write(path)
        for line in f:
            line.rstrip('\\')
The output:
tiles\T34TBL20211001T094041\T34TBL_20211001T094041_c38_r9_ndwi.png\
tiles\T34TBL20211001T094041\T34TBL_20211001T094041_c39_r0_ndwi.png\
tiles\T34TBL20211001T094041\T34TBL_20211001T094041_c39_r12_ndwi.png\
tiles\T34TBL20211001T094041\T34TBL_20211001T094041_c39_r13_ndwi.png\
path = "a\\string\\"
path = path.removesuffix("\\")
print(path)
will print:
a\string
This should do it (note that str.removesuffix() requires Python 3.9 or later). When writing a backslash in a string literal you have to double it, because a single backslash starts an escape sequence.
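A quick demonstration of the escaping rule; a raw string literal is an alternative that avoids the doubling:

s = "a\\string"   # the literal contains a single backslash: a\string
r = r"a\string"   # raw string: same value, no doubling needed
print(s == r)     # prints: True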
The correct output was achieved by writing the newline character separately from the image paths!
for num, name in enumerate(dirNames):
    os.mkdir(name)
    os.chdir(name)
    with open(f'{os.path.basename(name)}.txt', 'w+', encoding='utf-8') as f:
        for tile in png_cropedShips[num]:
            path = os.path.join('tiles', os.path.basename(name), os.path.basename(tile))
            f.write(path)
            f.write('\n')

Writing csv file from list but not all elements follow up

I have to reformat data from a text file into a CSV file. In the text file my data are already separated by spaces, so I make a list of strings (some containing multiple data values separated by spaces).
When I try to write my list into a text file it works well, but when I try to write it into a CSV file, the writing stops in the middle of a string and jumps to the next element in my list. I don't know why more than half of my data doesn't carry over. There is no end-of-line character or anything like that.
Here is my simple code
# importing libraries
import os

# defining location of parent folder
BASE_DIRECTORY = r'C:\Users\CAVC071777\Documents\1_Projet\Riverstart\Intrant EDPR\6-Background Harmonics Data at POI\test'
output_file = open('output.csv', 'w')
output = []
outputString = ""
file_list = []
i = 0

# scanning through sub folders
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
    for f in filenames:
        if 'txt' in str(f):
            e = os.path.join(str(dirpath), str(f))
            file_list.append(e)

for f in file_list:
    txtfile = open(f, 'r')
    i = 0
    for line in txtfile:
        if i == 3:
            outputString = '=Date("{0}","{1}","{2}")+TEMPS("{3}","{4}","{5}")'.format(line[46:48], line[40:42], line[43:45], line[58:60], line[61:63], line[64:66])
        if i > 8 and i < 71:
            outputString += line[9:71]
        i = i + 1
    output.append(outputString)
    outputString = ""

for row in output:
    print(row)
    output_file.write(row + "\n")
When I open the CSV file, all the data after 0.830% is missing.
When I print my list of strings containing my data in the terminal, it is well formatted and all my data is there.
The text files that I am trying to read look like this:
ET H
WHM1 SEL-735 Date: 09/17/19 Time: 11:46:03.726
HDW Time Source: ext
Fundamental Frequency = 60.0
Harmonic IA IB IC IN VA VB VC
2 0.166% 0.137% 0.166% 0.000% 0.000% 0.020% 0.010%
3 ...... ......
And so forth till 60
You have two problems here:
You are building a space-separated file
you are using Excel
Excel is known to have very poor support for CSV files. Long story made short: if you read a CSV file built by Excel on the same system, it will work smoothly. If you read a CSV file specifically built for your system, it should work. In any other use case it may or may not load correctly...
Here Excel expects the delimiter to be a ;, as that is the default delimiter for a French locale, or a , if you managed to tell it so. As there are neither in the rows, it just tries to put everything into the first cell, and visibly limits the length of a single field.
How to fix:
use LibreOffice or OpenOffice. Both lag behind Excel for almost every feature except CSV, where they do much better. You could declare at load time that the separator is a space and check that the lines are parsed correctly
Change the rows in the csv file to use the separator that your version of Excel expects
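For the second option, here is a minimal sketch (the file names are placeholders) that rewrites the space-separated rows with a ; delimiter so a French-locale Excel can split them into cells:

import csv

with open('output.csv', 'r') as src, open('output_semicolon.csv', 'w', newline='') as dst:
    writer = csv.writer(dst, delimiter=';')
    for line in src:
        writer.writerow(line.split())  # split on runs of whitespace, rewrite with ';'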

Replace decimal with comma in multiple text files

I have 10 TAB-delimited txt files in a folder. Each has three columns (numbers only) preceded by a 21-line header (text and numbers). In order to process them further, I would like to:
Choose the second column from all text files (starting after the 21-line header; I attached a figure with an arrow), convert the decimal comma into a decimal point, and stack each of these columns from the 10 files into a new tab-delimited/CSV file.
I know very little scripting. I have RStudio and Python and have tried to fiddle around a bit, but I really have no clue what to do. Since I have to process multiple folders, my work would be greatly simplified if this could be automated.
Reference figure
From your requirements it sounds like this Python code should do the trick:
import os
import glob

DIR = "path/to/your/directory"
OUTPUT_FILE = "path/to/your/output.csv"
HEADER_SIZE = 21

input_files = glob.glob(os.path.join(DIR, "*.txt"))
for input_file in input_files:
    print("Now processing", input_file)
    # read the file
    with open(input_file, "r") as h:
        contents = h.readlines()
    # drop header
    contents = contents[HEADER_SIZE:]
    # grab the 2nd column
    column = []
    for row in contents:
        # stop at the footer
        if "####" in row:
            break
        split = row.split("\t")
        if len(split) >= 2:
            column.append(split[1])
    # replace the comma
    column_replaced = [row.replace(",", ".") for row in column]
    # append to the output file
    with open(OUTPUT_FILE, "a") as h:
        h.write("\n".join(column_replaced))
        h.write("\n")  # end on a newline
Note that this will discard everything that wasn't part of the second column in the output file.
The code below is not an exact solution but if you follow along its lines you will be close to what you need.
output <- "NewFileName.txt"
old_dir <- setwd("your/folder")
files <- list.files(pattern = "\\.txt$")
df_list <- lapply(files, read.table, skip = 21, sep = "\t")
x <- lapply(df_list, '[[', 2)
x <- gsub(",", ".", unlist(x))
write.table(x, output, row.names = FALSE, col.names = FALSE)
setwd(old_dir)
lines = []
filename = "my_text"
with open(filename, "r") as infile:
    for line in infile:
        res = line.replace(",", ".")
        lines.append(res)
        print(res)
with open(filename, "w") as outfile:
    for item in lines:
        outfile.write(item)

import filenames iteratively from a different file

I have a large number of entries in a file. Let me call it file A.
File A:
('aaa.dat', 'aaa.dat', 'aaa.dat')
('aaa.dat', 'aaa.dat', 'bbb.dat')
('aaa.dat', 'aaa.dat', 'ccc.dat')
I want to use these entries, line by line, in a program that would iteratively pick an entry from file A and concatenate the files in this way:
filenames = ['aaa.dat', 'aaa.dat', 'ccc.dat']  # entry number 3
with open('out.dat', 'w') as outfile:  # the name has to be aaa-aaa-ccc.dat
    for fname in filenames:
        with open(fname) as infile:
            outfile.write(infile.read().strip())
All I need to do is to substitute the filenames iteratively and create an output in a "aaa-aaa-aaa.dat" format. I would appreciate any help-- feeling a bit lost!
Many thanks!!!
You can retrieve and modify the file names in the following way:
import re

pattern = re.compile(r'\W')
with open('fnames.txt', 'r') as infile:
    for line in infile:
        line = (re.sub(pattern, ' ', line)).split()
        # Old filenames - to concatenate contents
        content = [x + '.dat' for x in line[::2]]
        # New filename
        new_name = ('-').join(line[::2]) + '.dat'
        # Write the concatenated content to the new
        # file (first read the content all at once)
        with open(new_name, 'w') as outfile:
            for con in content:
                with open(con, 'r') as old:
                    new_content = old.read()
                    outfile.write(new_content)
This program reads your input file, here named fnames.txt and with the exact structure from your post, line by line. For each line it splits the entries using a precompiled regex (precompiling the regex is suitable here and should make things faster). This assumes that your filenames contain only alphanumeric characters, since the regex substitutes all non-alphanumeric characters with a space.
It retrieves only the 'aaa' and 'dat' entries as a list of strings for each line and forms a new name by joining every second entry starting from 0 and adding a .dat extension. It joins using a -, as in the post.
It then retrieves the individual file names from which it will extract the content into a list content, by selecting every second entry from line.
Finally, it reads each of the files in content and writes them to the common file new_name. It reads each of them all at once, which may be a problem if these files are big, and in general there may be more efficient ways of doing all this. Also, if you are planning to do more things with the content from the old files before writing, consider moving the old-file-specific operations to a separate function for readability and easier debugging.
Something like this:
with open(fname) as infile, open('out.dat', 'w') as outfile:
    for line in infile:
        line = line.strip()
        if line:  # not empty
            filenames = eval(line.strip())  # read tuple
            filenames = [f[:-4] for f in filenames]  # remove extension
            filename = '-'.join(filenames) + '.dat'  # make filename
            outfile.write(filename + '\n')  # write
If your problem is just calculating the new filenames, how about using os.path.splitext?
'-'.join([
    f[0] for f in [os.path.splitext(path) for path in filenames]
]) + '.dat'
Which can probably be better understood if you see it like this:
import os

clean_fnames = []
filenames = ['aaa.dat', 'aaa.dat', 'ccc.dat']
for fname in filenames:
    name, extension = os.path.splitext(fname)
    clean_fnames.append(name)

name_without_ext = '-'.join(clean_fnames)
name_with_ext = name_without_ext + '.dat'
print(name_with_ext)
HOWEVER: If your issue is that you cannot get the filenames in a list by reading the file line by line, you must keep in mind that when you read files, you get text (strings), NOT Python structures. You need to rebuild a list from a text like: "('aaa.dat', 'aaa.dat', 'aaa.dat')\n".
You could take a look at ast.literal_eval or try to rebuild it yourself. The code below outputs a lot of messages to show what's happening:
import pprint

collected_fnames = []
with open('./fileA.txt') as f:
    for line in f:
        print("Read this (literal) line: %s" % repr(line))
        line_without_whitespaces_on_the_sides = line.strip()
        if not line_without_whitespaces_on_the_sides:
            print("line is empty... skipping")
            continue
        else:
            line_without_parenthesis = (
                line_without_whitespaces_on_the_sides
                .lstrip('(')
                .rstrip(')')
            )
            print("Cleaned parenthesis: %s" % line_without_parenthesis)
            chunks = line_without_parenthesis.split(', ')
            print("Collected %s chunks in a %s: %s" % (len(chunks), type(chunks), chunks))
            chunks_without_quotations = [chunk.replace("'", "") for chunk in chunks]
            print("Now we don't have quotations: %s" % chunks_without_quotations)
            collected_fnames.append(chunks_without_quotations)

print("collected %s lines with filenames:\n%s" %
      (len(collected_fnames), pprint.pformat(collected_fnames)))
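For comparison, here is a minimal sketch of the ast.literal_eval route mentioned above; it safely parses the tuple literal on each line, without the risks of eval:

import ast

collected_fnames = []
with open('./fileA.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            # "('aaa.dat', 'aaa.dat', 'bbb.dat')" -> ['aaa.dat', 'aaa.dat', 'bbb.dat']
            collected_fnames.append(list(ast.literal_eval(line)))
print(collected_fnames)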

How to read the last few lines within a file using Python?

I am reading files with a specific file name from a folder. I can read the content of a file, but how do I read specific lines, or the last 6 lines, within a file?
************************************
Test Scenario No. 1
TestcaseID = FB_71125_1
dpSettingScript = FB_71125_1_DP.txt
************************************
Setting Pre-Conditions (DP values, Sqlite DB):
cp /fs/images/nfs/FileRecogTest/MNT/test/Databases/FB_71125_1_device.sqlite $NUANCE_DB_DIR/device.sqlite
"sync" twice.
Starting the test:
0#00041511#0000000000# FILERECOGNITIONTEST: = testScenarioNo (int)1 =
0#00041514#0000000000# FILERECOGNITIONTEST: = TestcaseID (char*)FB_71125_1 =
0#00041518#0000000000# FILERECOGNITIONTEST: = dpSettingScript (char*)FB_71125_1_DP.txt =
0#00041520#0000000000# FILERECOGNITIONTEST: = UtteranceNo (char*)1 =
0#00041524#0000000000# FILERECOGNITIONTEST: = expectedEventData (char*)0||none|0||none =
0#00041528#0000000000# FILERECOGNITIONTEST: = expectedFollowUpDialog (char*) =
0#00041536#0000000000# FILERECOGNITIONTEST: /fs/images/nfs/FileRecogTest/MNT/test/main_menu.wav#MEDIA_COND:PAS_MEDIA&MEDIA_NOT_BT#>main_menu.global<#<FS0000_Pos_Rec_Tone><FS1000_MainMenu_ini1>
0#00041789#0000000000# FILERECOGNITIONTEST: Preparing test data done
0#00043768#0000000000# FILERECOGNITIONTEST: /fs/images/nfs/FileRecogTest/MNT/test/Framework.wav##>{any_device_name}<#<FS0000_Pos_Rec_Tone><FS1400_DeviceDisambig_<slot>_ini1>
0#00044008#0000000000# FILERECOGNITIONTEST: Preparing test data done
0#00045426#0000000000# FILERECOGNITIONTESTWARNING: expected >{any_device_name}<, got >lowconfidence1#FS1000_MainMenu<
1900#00046452#0000000000# FILERECOGNITIONTESTERROR: expected <FS0000_Pos_Rec_Tone><FS1400_DeviceDisambig_<slot>_ini1>, got <FS0000_Misrec_Tone><FS1000_MainMenu_nm1_004><pause300><FS1000_MainMenu_nm_001>
0#00046480#0000000000# FILERECOGNITIONTEST: Preparing test data done
0#00047026#0000000000# FILERECOGNITIONTEST: Stopping dialog immediately
[VCALogParser] Scenario 1 FAILED.
Can someone suggest how to read specific lines, or the last 6 lines, within a file?
I can think of two methods. If your files are not too big, you can just read all lines and keep only the last six:
f = open(some_path)
last_lines = f.readlines()[-6:]
But that's really brute-force. Something cleverer is to make a guess, using the seek() method of your file object:
import os

file_size = os.stat(some_path).st_size  # in _bytes_, so take care depending on encoding
f = open(some_path)
f.seek(file_size - 1000)  # here's the guess. Adjust to the expected line length
last_lines = f.readlines()[-6:]  # note: the first line read may be partial
To read the last 6 lines of a single file, you could use Python's file.seek to move close to the end of the file and then read the remaining lines. You need to decide what the maximum line length could possibly be, e.g. 1024 characters.
The seek command is first used to move to the end of the file (without reading it in); tell is then used to determine which position in the file we are at (as we are at the end, this is the length). The code then goes backwards in the file and reads the lines in. If the file is very short, the whole file is read in.
import os

filename = r"C:\Users\hemanth_venkatappa\Desktop\TEST\Language\test.txt"
back_up = 6 * 1024  # Go back from the end more than 6 lines' worth.

with open(filename, "r") as f_input:
    f_input.seek(0, os.SEEK_END)
    backup = min(back_up, f_input.tell())
    f_input.seek(-backup, os.SEEK_END)
    print f_input.readlines()[-6:]
Using with ensures your file is automatically closed afterwards. Prefixing your file path with r avoids having to double every backslash in the path.
So to then apply this to your directory walk and write your results to a separate output file, you could do the following:
import os
import re

back_up = 6 * 256  # Go back from the end more than 6 lines' worth
directory = r"C:\Users\hemanth_venkatappa\Desktop\TEST\Language"
output_filename = r"C:\Users\hemanth_venkatappa\Desktop\TEST\output.txt"

with open(output_filename, 'w') as f_output:
    for dirpath, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename.startswith('VCALogParser_output'):
                cur_file = os.path.join(dirpath, filename)
                with open(cur_file, "r") as f_input:
                    f_input.seek(0, os.SEEK_END)
                    backup = min(back_up, f_input.tell())
                    f_input.seek(-backup, os.SEEK_END)
                    last_lines = ''.join(f_input.readlines()[-6:])
                try:
                    summary = ', '.join(re.search(r'(\d+ warning\(s\)).*?(\d+ error\(s\)).*?(\d+ scenarios\(s\))', last_lines, re.S).groups())
                except AttributeError:
                    summary = "No summary"
                f_output.write('{}: {}\n'.format(filename, summary))
Or, essentially, use a for loop to append each line to a list and then delete items from the front until only the last six remain:
array = []
f = open("file.txt", "r")
for line in f:
    array.append(line)  # append the line itself, not f.readlines()
f.close()
while len(array) > 6:  # keep only the last 6 lines
    del array[0]
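A more idiomatic variant of the same idea uses collections.deque with maxlen, so only the six most recent lines are ever kept in memory while reading:

from collections import deque

with open("file.txt", "r") as f:
    last_six = deque(f, maxlen=6)  # holds at most the 6 most recent lines
print(list(last_six))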
