Python script to count num lines in all files in directory - python

So I'm new to python and I'm trying to write a script that iterates through all .txt files in a directory, counts the number of lines in each one (with exception to lines that are blank or commented out), and writes the final output to a csv. The final output should look something like this:
agprices, avi, adp
132, 5, 8
I'm having trouble with the syntax to save each count as the value of the dictionary. Here is my code below:
#!/usr/bin/env python
import csv
import copy
import os
import sys
#get current working dir, set count, and select file delimiter
d = os.getcwd()
count = 0
ext = '.txt'
#parses through files and saves to a dict
series_dict = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
with open(os.path.join(d,f)) as file_obj:
# NOTE(review): read() slurps the whole file; there is no per-line loop,
# so the 'line' name used below is never defined (NameError at runtime).
series_dict[f] = file_obj.read()
# BUG: test is inverted -- a NON-blank line hits 'continue' and is skipped.
if line.strip(): #Exclude blank lines
continue
# BUG: Python has no 'else if'; this must be 'elif' (SyntaxError as written).
else if line.startswith("#"): #Exclude commented lines
continue
# BUG: missing ':' after 'else'.
else
count +=1
#Need to save count as val in dict here
#save the dictionary with key/val pairs to a csv
# NOTE(review): 'wb' is the Python 2 convention; csv in Python 3 wants
# text mode with newline=''.
with open('seriescount.csv', 'wb') as f:
w = csv.DictWriter(f, series_dict.keys())
w.writeheader()
w.writerow(series_dict)
So here's the edit:
#!/usr/bin/env python
import csv
import copy
import os
import sys
import glob
#get current working dir, set count, and select file delimiter
os.chdir('/Users/Briana/Documents/Misc./PythonTest')
#parses through files and saves to a dict
series = {}
for fn in glob.glob('*.txt'):
with open(fn) as f:
# BUG: this stores the generator object itself, not a count. It needs to
# be wrapped in sum(...); being lazy, it would also try to read from the
# file after the 'with' block has closed it.
series[fn] = (1 for line in f if line.strip() and not line.startswith('#'))
# NOTE(review): Python 2 print statement; use print(series) in Python 3.
print series
#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
w = csv.DictWriter(f, series.keys())
# BUG: 'names' is undefined (the dict is called 'series'), the sum's result
# is discarded, and no row is ever written to the csv.
sum(names.values())
I'm getting an indentation error on the 2nd to last line and am not quite sure why? Also, I'm not positive that I'm writing the syntax correctly on the last part. Again, I'm simply trying to return a dictionary with names of files and number of lines in files like {a: 132, b:245, c:13}

You can try something along these lines:
# Count non-blank, non-comment lines in every .txt file of a directory,
# producing {filename: line_count}.
os.chdir(ur_directory)  # placeholder: replace with the directory to scan
names = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        # sum() consumes the generator here, so an int is stored
        names[fn] = sum(1 for line in f
                        if line.strip() and not line.startswith('#'))
print(names)  # fixed: the original used the Python 2 print statement
That will print a dictionary similar to:
{'test_text.txt': 20, 'f1.txt': 3, 'lines.txt': 101, 'foo.txt': 6, 'dat.txt': 6, 'hello.txt': 1, 'f2.txt': 4, 'neglob.txt': 8, 'bar.txt': 6, 'test_reg.txt': 6, 'mission_sp.txt': 71, 'test_nums.txt': 8, 'test.txt': 7, '2591.txt': 8303}
And you can use that Python dict in csv.DictWriter.
If you want the sum of those, just do:
sum(names.values())

I think you should make two changes to your script:
Use glob.glob() to get the list of files matching your desired suffix
Use for line in file_obj to iterate through the lines
Other problem:
The indentation is wrong on your last few lines

You could count your lines in your files with this 1-liner:
# Count of non-blank, non-comment lines in the file at path f.
# NOTE(review): the handle returned by open(f) is never closed; wrap it in
# a 'with' block in real code.
line_nums = sum(1 for line in open(f) if line.strip() and line[0] != '#')
that would shorten your code segment to
# Accumulate the non-blank, non-comment line count of every file in
# txt_files into 'count'.
for f in txt_files:
    # fixed: use 'with' so each file handle is closed deterministically
    # (the original leaked one open handle per file)
    with open(os.path.join(d, f)) as fh:
        # strip() is tested first so an empty string can never reach
        # line[0] and raise IndexError
        count += sum(1 for line in fh
                     if line.strip() and line[0] != '#')

It looks like you want to use a dictionary to keep track of the counts. You could create one at the top like this: counts = {}
Then (once you fix your tests) you can update it for each non-comment line:
# Per-file count of non-comment, non-blank lines: {filename: count}.
counts = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    counts[f] = 0  # create an entry to keep track of this file's lines
    with open(os.path.join(d, f)) as file_obj:
        # fixed: iterate line by line instead of read(), which defined no
        # 'line' variable at all in the original
        for line in file_obj:
            if line.startswith("#"):  # Exclude commented lines
                continue
            elif line.strip():  # Exclude blank lines
                # fixed: counts[f], not counts(f) -- item assignment,
                # not a (syntactically invalid) call target
                counts[f] += 1

Related

Adding a comma to end of first row of csv files within a directory using python

I've got some code that lets me open all csv files in a directory and run through them, removing the top 2 lines of each file. Ideally, during this process I would also like it to add a single comma at the end of the new first line (what would originally have been line 3).
Another approach that's possible could be to remove the trailing comma's on all other rows that appear in each of the csvs.
Any thoughts or approaches would be gratefully received.
import glob

# Drop the first two lines of every .csv file under 'path', in place.
path = r'P:\pytest'  # raw string keeps the backslash literal
for filename in glob.iglob(path + '/*.csv'):
    with open(filename, 'r') as f:
        lines = f.read().split("\n")
    # fixed: no f.close() needed -- the 'with' block already closed it
    if len(lines) >= 1:  # always true: split() yields at least ['']
        lines = lines[2:]
        # fixed: output handle is context-managed instead of open()/close()
        with open(filename, 'w') as o:
            for line in lines:
                o.write(line + '\n')
adding a counter in there can solve this:
import glob

# Drop the first two lines of every .csv under 'path'; the first surviving
# line (originally line 3) gets a trailing comma appended.
path = r'C:/Users/dsqallihoussaini/Desktop/dev_projects/stack_over_flow'
for filename in glob.iglob(path + '/*.csv'):
    with open(filename, 'r') as f:
        lines = f.read().split("\n")
    print(lines)  # debug output, kept from the original
    # fixed: no f.close() needed -- the 'with' block already closed it
    if len(lines) >= 1:  # always true: split() yields at least ['']
        lines = lines[2:]
        # fixed: output handle is context-managed instead of open()/close()
        with open(filename, 'w') as o:
            # enumerate replaces the hand-rolled counter variable
            for idx, line in enumerate(lines, start=1):
                if idx == 1:
                    o.write(line + ',\n')
                else:
                    o.write(line + '\n')
One possible problem with your code is that you are reading the whole file into memory, which might be fine. If you are reading larger files, then you want to process the file line by line.
The easiest way to do that is to use the fileinput module: https://docs.python.org/3/library/fileinput.html
Something like the following should work:
#!/usr/bin/env python3
import glob
import fileinput
# inplace makes a backup of the file, then any output to stdout is written
# to the current file.
# change the glob..below is just an example.
#
# Iterate through each file in the glob.iglob() results
# NOTE(review): if the glob matches no files, fileinput falls back to
# reading stdin -- confirm that is acceptable before shipping this.
with fileinput.input(files=glob.iglob('*.csv'), inplace=True) as f:
for line in f: # Iterate over each line of the current file.
if f.filelineno() > 2: # Skip the first two lines
# Note: 'line' has the newline in it.
# Insert the comma if line 3 of the file, otherwise output original line
print(line[:-1]+',') if f.filelineno() == 3 else print(line, end="")
I've added an encoding argument as well, since mine was throwing an error, and specifying the encoding fixed that up nicely.
import glob
# Raw string so the Windows path's backslashes (if any) stay literal.
path=r'C:/whateveryourfolderis'
for filename in glob.iglob(path+'/*.csv'):
# utf-8 avoids UnicodeDecodeError on non-ASCII content
with open(filename, 'r',encoding='utf-8') as f:
lines = f.read().split("\n")
#print(lines)
# NOTE(review): redundant -- the 'with' block already closes the file.
f.close()
if len(lines) >= 1:
# drop the first two lines of the file
lines = lines[2:]
o = open(filename, 'w',encoding='utf-8')
counter=0
for line in lines:
counter=counter+1
# first surviving line (originally line 3) gets a trailing comma
if counter==1:
o.write(line+',\n')
else:
o.write(line+'\n')
o.close()

Loop through pair of files python

I have a script that receives two files as input and creates a dictionary based on lines. Finally, it overwrites the first file.
I am looking for a way to run this script on all file pairs of a folder, choosing as sys.argv[1] and sys.argv[2] based on a pattern in the name.
import re
import sys
# Usage: script.py <datafile> <schemaseqs>
# Both inputs are read as two-line records: an ID line followed by a value
# line. NOTE(review): inferred from the alternating i % 2 logic -- confirm
# against the actual file format (looks FASTA-like).
datafile = sys.argv[1]
schemaseqs = sys.argv[2]
datafile_lines = []
d = {}
prev = None
# Build d: {id_line: following_line} from the datafile.
with open(datafile, 'r') as f:
i = 0
for line in f:
if i % 2 == 0:
d[line.strip()]=0
prev = line.strip()
else:
d[prev] = line.strip()
i+=1
new_d = {}
# Build new_d the same way from the schema file.
with open(schemaseqs, 'r') as f:
i=0
prev = None
for line in f:
if i % 2 == 0:
new_d[line.strip()]=0
prev = line.strip()
else:
new_d[prev] = line.strip()
i+=1
# Remap: wherever a datafile value appears as a key in the schema dict,
# replace it with the schema's value.
for key, value in d.items():
if value in new_d:
d[key] = new_d[value]
print(d)
# Overwrite the datafile with the remapped pairs, one item per line.
with open(datafile,'w') as filee:
for k,v in d.items():
filee.writelines(k)
filee.writelines('\n')
filee.writelines(v)
filee.writelines('\n')
I have hundreds of file pairs, all sharing the same pattern proteinXXXX (where XXXX is a number). This number can have up to four digits (e.g. 9, 99, 999 or 9999). So I have protein555.txt and protein555.fasta.
I've seen I can use glob or os.listdir to read files from a directory. However, I cannot assign them to a variable and extract the lines one pair at a time in every pair of the directory.
Any help is appreciated.
Just the concept.
Import required libraries.
import glob
import os.path
Define function that extracts only the basename (the part without extension) from filename.
def basename(fn):
    """Return *fn*'s filename with directory and extension stripped."""
    stem, _ext = os.path.splitext(os.path.basename(fn))
    return stem
Create two sets, one with .txt files, another with .fasta files.
# Sets of basenames per extension, so the pairing test below is O(1).
t = {basename(fn) for fn in glob.glob("protein*.txt")}
f = {basename(fn) for fn in glob.glob("protein*.fasta")}
Calculate intersection of these two sets to be sure that both .txt and .fasta files exist with the same basename. Then add the missing suffixes and let them process with the existing code.
# Only basenames that exist with BOTH extensions are processed; 'process'
# is the user's existing two-argument script body wrapped in a function.
for bn in t.intersection(f):
process(bn + ".txt", bn + ".fasta")

Python: Reading multiple files and storing the output for a particular file

I have a thousand .xvg files in a directory which I need to read and store an output for each of them.
Currently I have a python code which works for only one file. Could you please suggest how do I read all files at once and get an output and store it for every file separately?
# Read one .xvg file, skip its 22-line header, and classify rows by the
# value in column 2 (index 1).
f = open('/home/abc/xyz/coord/coord_1.xvg')
# NOTE(review): the handle is never closed; prefer a 'with' block.
dat = f.readlines()
dat1 = dat[22:len(dat)]
dat2=[]
for k in dat1:
dat2.append(k.split())
for k in dat2:
if float(k[1])>=9.5:
print('P')
break
elif float(k[1])<=5.9:
print('R')
break
else:
# NOTE(review): this prints 'N' for EVERY line inside the 5.9..9.5
# band, not just once -- confirm that is intended.
print('N')
Here's a version but used as much as code as possible to make it easier to follow.
import os
def process_files():
" Will process all files in folder using your code "
# Scan the current directory for .xvg files; for each one, write the
# P/R/N classification of its data rows to a same-named .txt file.
for file in os.listdir("."): # '.' correspond to the current directory
# You can specify whatever directory,
#such as /usr/accidental_coder/working
if file.endswith(".xvg"):
# Find found
# Output will be with same name but with .txt suffix
with open(os.path.join(".", file), 'r') as infile, \
open(os.path.join(".", file.replace('.xvg', '.txt')), 'w') as ofile:
# Using your original code
# left alone so you could know how to change if desired
# (note: your can be shortened)
dat = infile.readlines()
# skip the 22-line header
dat1 = dat[22:len(dat)]
dat2=[]
for k in dat1:
dat2.append(k.split())
for k in dat2:
if float(k[1])>=9.5:
ofile.write('P\n')
break
elif float(k[1])<=5.9:
ofile.write('R\n')
break
else:
# NOTE(review): writes 'N' once per in-band line, not once
# per file -- mirrors the question's original behavior.
ofile.write('N\n')
process_files()
Refactoring Your Code for Better Performance
It seems you just process the 23rd line in each file.
import os
from itertools import islice

def process_files():
    """Classify line 23 of every .xvg file in the current directory.

    For each ``*.xvg`` file, writes ``P``, ``R`` or ``N`` to a same-named
    ``.txt`` file depending on whether the float in column 2 (index 1) of
    the 23rd line is >= 9.5, <= 5.9, or in between.
    """
    for file in os.listdir("."):
        # Examples of getting files from directories
        # https://stackoverflow.com/questions/3964681/find-all-files-in-a-directory-with-extension-txt-in-python
        if file.endswith(".xvg"):
            with open(os.path.join(".", file), 'r') as infile, \
                 open(os.path.join(".", file.replace('.xvg', '.txt')), 'w') as ofile:
                # fixed: next(infile) raised StopIteration on files with
                # fewer than 23 lines; islice + default skips them safely
                data = next(islice(infile, 22, 23), None)
                if data is None:
                    continue  # fewer than 23 lines: nothing to classify
                k = data.split()
                if float(k[1]) >= 9.5:
                    ofile.write('P\n')
                elif float(k[1]) <= 5.9:
                    ofile.write('R\n')
                else:
                    ofile.write('N\n')

process_files()

Creating files with names based on entries in txt file

i = 1
# Create one <name>.txt file per line of brief.txt, containing the name.
with open("randomStuff\\test\\brief.txt") as textFile:
# Each iterated line keeps its trailing '\n', so split('\n') yields
# ['name', ''] -- the empty element is what later creates the stray
# file called just ".txt".
lines = [line.split('\n') for line in textFile]
for row in lines:
for elem in row:
with open(elem + ".txt", "w") as newLetter:
newLetter.writelines(elem)
i += 1
I have a txt file with names. I want to create files with those names like:
firstnameLastname.txt
The names appear in the files too.
At the moment it is working fine, but it creates one empty file called ".txt".
Can someone tell me why? If I'm right the problem should be in the loops.
Add an if statement to prevent creating files on empty lines
Edit
# Create one <name>.txt file per non-empty line of brief.txt.
i = 1
with open("randomStuff\\test\\brief.txt") as textFile:
    lines = [line.split('\n') for line in textFile]
    for row in lines:
        for elem in row:
            # fixed: the original used curly quotes (“”), which are a
            # SyntaxError in Python; skip the empty split artifact so no
            # stray ".txt" file is created
            if elem == "":
                continue
            with open(elem + ".txt", "w") as newLetter:
                newLetter.writelines(elem)
            i += 1
Continue will jump to the next loop cycle without execute the below code
I don't know why you have so many loops:
from pathlib import Path

# One file per non-empty line of brief.txt, named after the line itself.
# fixed: str has no split_lines() method -- the correct name is splitlines()
text_file_content = Path("randomStuff/test/brief.txt").read_text().splitlines()
for line in text_file_content:
    if line:  # skip blanks, e.g. a trailing newline at end of file
        with open(f"{line}.txt", "w") as new_letter:
            new_letter.writelines(line)

Split a large text file to small ones based on location

Suppose I have a big file as file.txt and it has data of around 300,000. I want to split it based on certain key location. See file.txt below:
Line 1: U0001;POUNDS;**CAN**;1234
Line 2: U0001;POUNDS;**USA**;1234
Line 3: U0001;POUNDS;**CAN**;1234
Line 100000; U0001;POUNDS;**CAN**;1234
The locations are limited to 10-15 different nation. And I need to separate each record of a particular country in one particular file. How to do this task in Python
Thanks for help
This will run with very low memory overhead as it writes each line as it reads it.
Algorithm:
open input file
read a line from input file
get country from line
if new country then open file for country
write the line to country's file
loop if more lines
close files
Code:
# Split file.txt into one <country>.txt file per country code found in the
# third ';'-separated field; handles are opened lazily and always closed.
with open('file.txt', 'r') as infile:
    handles = {}
    try:
        for record in infile:
            nation = record.split(';')[2].strip('*')
            if nation not in handles:
                handles[nation] = open(nation + '.txt', 'w')
            handles[nation].write(record)
    finally:
        # close every per-country file even if a line failed to parse
        for handle in handles.values():
            handle.close()
from itertools import groupby
from operator import itemgetter  # fixed: 'from operator get' is a SyntaxError

# Split file.txt into per-country files by sorting on the third field and
# grouping consecutive equal keys.
with open("file.txt") as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
text = [x.strip() for x in content]
x = [i.split(";") for i in text]
# groupby only merges *consecutive* equal keys, so sort on the key first
x.sort(key=itemgetter(2))  # itemgetter replaces the shadowing lambda
y = groupby(x, itemgetter(2))
res = [(key, list(group)) for key, group in y]
for country in res:
    with open(country[0] + ".txt", "w") as writeFile:
        writeFile.writelines("%s\n" % ';'.join(l) for l in country[1])
will group by your item!
Hope it helps!
Looks like what you have is a csv file. csv stands for comma-separated values, but any file that uses a different delimiter (in this case a semicolon ;) can be treated like a csv file.
We'll use the python module csv to read the file in, and then write a file for each country
import csv
from collections import defaultdict

# Group the rows of the ';'-delimited file.txt by their third field, then
# write each group to its own <country>.txt file.
d = defaultdict(list)
# fixed: text mode ('rb'/'wb' were Python 2 csv conventions; Python 3's
# csv module requires str, with newline='' on the writer side)
with open('file.txt') as f:
    r = csv.reader(f, delimiter=';')
    for line in r:
        # fixed: the original indexed 'l', which was never defined
        d[line[2]].append(line)
for country in d:
    with open('{}.txt'.format(country), 'w', newline='') as outfile:
        w = csv.writer(outfile, delimiter=';')
        for line in d[country]:
            w.writerow(line)
# the formatting-function for the filename used for saving
outputFileName = "{}.txt".format
# alternative:
##import time
##outputFileName = lambda loc: "{}_{}.txt".format(loc, time.asctime())
# make a dictionary indexed by location; each value is the accumulated
# content of that location's output file
sortedByLocation = {}
with open("file.txt", "r") as f:  # 'with' replaces the bare open()/close()
    # iterate each line and look at the column for the location
    for l in f:
        line = l.split(';')
        # the third field (indices begin with 0) is the location
        # abbreviation; lowered because case-insensitive filesystems would
        # otherwise overwrite files that Python's dict keeps distinct
        location = line[2].lower().strip()
        # append this line to the location's accumulated text
        sortedByLocation[location] = sortedByLocation.get(location, "") + l.strip() + '\n'
# save one file per location
for location, text in sortedByLocation.items():
    # fixed: the original was missing the closing ')' (SyntaxError) and the
    # 'w' mode -- the default 'r' would make f.write() fail
    with open(outputFileName(location), "w") as f:
        f.write(text)

Categories

Resources