Python Line Comparison

I have a directory with subdirectories full of files that all contain the line:
VERSION "1.0" # THAT NUMBER VARIES.
So one file, as an example of the content, would read:
#comments about what the file program does
#
#
#ifndef _AWS_THIS_READS_STUFF_H
#define _AWS_THIS_READS_STUFF_H
#define AWS_THIS_READS_STUFF_VERSION "1.2" <-- this is the line I want to compare; it is in all the files
Then the rest of the file would be some long program written in C.
I use the number to identify when I have made changes. I need to make something in Python that identifies whether the numbers before and after the decimal all match within a directory, since there are hundreds of files.
import glob
import sys
import re
import filecmp
from collections import defaultdict
from itertools import dropwhile

myfiles = glob.glob('*.h')
for filename in myfiles:
    print(filename)  # this prints all of the file names

def all_same(patt):
    r = re.compile(r'#define\s+([_A-Z]+)_VERSION\s+("\d+\.\d+")$')  # REGEX to find
    files = glob.glob(patt)
    d = defaultdict(list)
    for file in files:
        with open(file) as f:
            version = r.search(next(dropwhile(lambda x: "VERSION" not in x, f)))
            d[version].append(file)  # search for VERSION LINE
    return d
When it is run from my command prompt it prints nothing out.
I ALSO TRIED THIS! I got it to print out the file and the "0.0" number, but now I need to compare!
myfiles = glob.glob('*.h')  # reads in all files ending in .h
for file in myfiles:
    for line in open(file):
        line = line.rstrip()
        if re.search(r'VERSION\s+("\d+\.\d+")$', line):
            list = re.findall(r"\d+\.\d+", line)
            list.append(file)
            print(list)
            #print(list + ' : ' + root + "/" + myfile)
    with open(file) as f:
        version = re.findall(r'VERSION\s+("\d+\.\d+")$', file)
        version = re.search(next(dropwhile(lambda x: "VERSION" not in x, f)))
        print(version)
Just need to figure out how to compare the numbers in the list of "0.0" values (once again, before the decimal and after).

If your version line is at the head of each file, you could pull the first file returned from iglob and compare its VERSION line to the rest with all().
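A minimal sketch of that idea (assuming at least one matching .h file and that every header contains a VERSION line in the quoted form shown in the question):

import glob
import re
from itertools import dropwhile

r = re.compile(r'VERSION\s+"(\d+\.\d+)"')

def version_of(path):
    # Pull the version number out of the first line containing "VERSION".
    with open(path) as f:
        line = next(dropwhile(lambda x: "VERSION" not in x, f))
    return r.search(line).group(1)

files = glob.glob('*.h')
# True when every file carries the same version number as the first one.
print(all(version_of(f) == version_of(files[0]) for f in files[1:]))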
If you actually want the names of all the files with different version numbers, I would find the VERSION line in each file and group the file names in a dict keyed by version number:
import glob
import re
from collections import defaultdict
from itertools import dropwhile

def all_same(path):
    r = re.compile(r'VERSION\s+"(\d+\.\d+)"')
    files = glob.iglob(path)
    d = defaultdict(list)
    for file in files:
        with open(file) as f:
            version = r.search(next(dropwhile(lambda x: "VERSION" not in x, f))).group(1)
            d[version].append(file)
    return d
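A usage sketch, under the same assumption that every header has a matching VERSION line: if the returned dict has a single key, every file shares one version; otherwise its items name the files carrying each version:

d = all_same('*.h')
if len(d) == 1:
    print("all files share one version")
else:
    for version, files in d.items():
        print(version, "->", files)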

Related

Loop through pair of files python

I have a script that receives two files as input and creates a dictionary based on lines. Finally, it overwrites the first file.
I am looking for a way to run this script on all file pairs in a folder, choosing sys.argv[1] and sys.argv[2] based on a pattern in the name.
import re
import sys

datafile = sys.argv[1]
schemaseqs = sys.argv[2]
datafile_lines = []
d = {}
prev = None
# Read the first file as key/value pairs: even lines are keys, odd lines values.
with open(datafile, 'r') as f:
    i = 0
    for line in f:
        if i % 2 == 0:
            d[line.strip()] = 0
            prev = line.strip()
        else:
            d[prev] = line.strip()
        i += 1

new_d = {}
# Read the second file the same way.
with open(schemaseqs, 'r') as f:
    i = 0
    prev = None
    for line in f:
        if i % 2 == 0:
            new_d[line.strip()] = 0
            prev = line.strip()
        else:
            new_d[prev] = line.strip()
        i += 1

# Replace values in the first dict with their mapping from the second file.
for key, value in d.items():
    if value in new_d:
        d[key] = new_d[value]
print(d)

# Overwrite the first file with the updated pairs.
with open(datafile, 'w') as filee:
    for k, v in d.items():
        filee.writelines(k)
        filee.writelines('\n')
        filee.writelines(v)
        filee.writelines('\n')
I have hundreds of file pairs, all sharing the same pattern proteinXXXX (where XXXX is a number). This number can have up to four digits (e.g. 9, 99, 999 or 9999). So I have protein555.txt and protein555.fasta.
I've seen that I can use glob or os.listdir to read files from a directory. However, I cannot work out how to assign each pair to variables and extract the lines one pair at a time across the directory.
Any help is appreciated.
Just the concept.
Import required libraries.
import glob
import os.path
Define a function that extracts only the basename (the part without the extension) from a filename.
def basename(fn):
    return os.path.splitext(os.path.basename(fn))[0]
Create two sets, one with .txt files, another with .fasta files.
t = {basename(fn) for fn in glob.glob("protein*.txt")}
f = {basename(fn) for fn in glob.glob("protein*.fasta")}
Calculate the intersection of these two sets to be sure that both .txt and .fasta files exist with the same basename. Then add the missing suffixes back and let the existing code process each pair, as sketched below.
for bn in t.intersection(f):
    process(bn + ".txt", bn + ".fasta")

Counting the number of lines of text in all files within a folder and subfolders using Python

I have a huge folder with subfolders and multiple .sql files within those subfolders. I want to get the number of lines of code within every .sql file. This is what I've tried:
import os
import glob

os.chdir("path of folder")
names = []
for fn in glob.glob("*.sql"):
    with open(fn) as f:
        names[fn] = sum(1 for line in f if line.strip() and not line.startswith('#'))
print(names)
But the output I get is [ ]. Could you guys help me with where I'm going wrong?
I know how to count the number of lines of code within a single file using "num_lines", but I can't do that manually for each file and need to quicken the process.
The following version of your code works for files in the target directory, but not in sub-folders:
import os
import glob

os.chdir("foo")
names = {}
for fn in glob.glob("*.sql"):
    with open(fn) as f:
        names[fn] = sum(1 for line in f if line.strip() and not line.startswith('#'))
print(names)
A version with the newer pathlib works recursively too:
#!/usr/bin/env python3
from pathlib import Path

target = Path("foo")
names = {}
for file in target.glob("**/*.sql"):
    with file.open("rt") as f:
        names[f.name] = sum(
            1 for line in f
            if line.strip() and not line.startswith('#')
        )
print(names)
Try this:
from os import listdir
from os.path import isfile, join

sql_folder_path = "full/path/to/sql/folder"
sql_files = [join(sql_folder_path, f) for f in listdir(sql_folder_path)
             if isfile(join(sql_folder_path, f)) and f.endswith(".sql")]

files_stats = {}
for file in sql_files:
    with open(file) as f:
        files_stats[file] = sum(1 for line in f if line.strip() and not line.startswith('#'))
print(files_stats)
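Note that listdir does not descend into subfolders; if recursion is needed, a minimal sketch with os.walk (the folder path is a placeholder):

import os

files_stats = {}
for root, dirs, files in os.walk("full/path/to/sql/folder"):
    for name in files:
        if name.endswith(".sql"):
            path = os.path.join(root, name)
            with open(path) as f:
                files_stats[path] = sum(1 for line in f
                                        if line.strip() and not line.startswith('#'))
print(files_stats)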

Python - Extract Code from Text using Regex

I am a Python beginner and looking for help with an extraction problem.
I have a bunch of text files and need to extract all special combinations of an expression ("C" + exactly 9 numerical digits) and write them to a file, including the filename of the text file. Each occurrence of the expression starts at the beginning of a new line and ends with a "\n".
sample_text = """Some random text here
and here
and here
C123456789
some random text here
C987654321
and here
and here"""
What the output should look like (in the output file):
My_desired_output_file = "filename,C123456789,C987654321"
My code so far:
min_file_size = 5

def list_textfiles(directory, min_file_size):  # creates a list of all files stored in DIRECTORY ending on '.txt'
    textfiles = []
    for root, dirs, files in os.walk(directory):
        for name in files:
            filename = os.path.join(root, name)
            if os.stat(filename).st_size > min_file_size:
                textfiles.append(filename)
    return textfiles

for filename in list_textfiles(temp_directory, min_file_size):
    string = str(filename)
    text = infile.read()
    regex = ???
    with open(filename, 'w', encoding="utf-8") as outfile:
        outfile.write(regex)
Your regex is '^C[0-9]{9}$':
^      start of line
C      literal C
[0-9]  any digit
{9}    nine times
$      end of line
import re

regex = re.compile(r'(^C\d{9})')
matches = []
with open('file.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if regex.match(line):
            matches.append(line)
You can then write this list to a file as needed.
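For instance, a small sketch (the output filename and the 'file.txt' label are assumptions):

# matches holds the lines collected above; prefix them with the source filename.
with open('output.txt', 'w') as out:
    out.write(','.join(['file.txt'] + matches) + '\n')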
How about:
import re
sample_text = """Some random text here
and here
and here
C123456789
some random text here
C987654321
and here
and here"""
k = re.findall(r'(C\d{9})', sample_text)
print(k)
This will return all occurrences of that pattern. If you iterate over each file line by line you can store the matches per file. Something like:
Updated:
import glob
import os
import re

search = {}
os.chdir('/FolderWithTxTs')
for file in glob.glob("*.txt"):
    with open(file, 'r') as f:
        data = [re.findall(r'(C\d{9})', i) for i in f]
        search.update({f.name: data})
print(search)
This would return a dictionary with file names as keys and a list of found matches.
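To get the one-line-per-file output from the question ("filename,match1,match2"), a sketch along these lines could follow (the output filename is an assumption):

# search maps each filename to a list of per-line findall results;
# flatten those lists and join them behind the filename.
with open('output.txt', 'w', encoding='utf-8') as out:
    for name, data in search.items():
        matches = [m for found in data for m in found]
        out.write(','.join([name] + matches) + '\n')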

Python script to count num lines in all files in directory

So I'm new to Python and I'm trying to write a script that iterates through all .txt files in a directory, counts the number of lines in each one (except lines that are blank or commented out), and writes the final output to a csv. The final output should look something like this:
agprices, avi, adp
132, 5, 8
I'm having trouble with the syntax to save each count as the value of the dictionary. Here is my code below:
#!/usr/bin/env python
import csv
import copy
import os
import sys

#get current working dir, set count, and select file delimiter
d = os.getcwd()
count = 0
ext = '.txt'

#parses through files and saves to a dict
series_dict = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    with open(os.path.join(d, f)) as file_obj:
        series_dict[f] = file_obj.read()
        if line.strip(): #Exclude blank lines
            continue
        else if line.startswith("#"): #Exclude commented lines
            continue
        else
            count += 1
#Need to save count as val in dict here

#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, series_dict.keys())
    w.writeheader()
    w.writerow(series_dict)
So here's the edit:
#!/usr/bin/env python
import csv
import copy
import os
import sys
import glob

#get current working dir, set count, and select file delimiter
os.chdir('/Users/Briana/Documents/Misc./PythonTest')

#parses through files and saves to a dict
series = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        series[fn] = (1 for line in f if line.strip() and not line.startswith('#'))
print series

#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, series.keys())
    sum(names.values())
I'm getting an indentation error on the 2nd-to-last line and am not quite sure why. Also, I'm not positive that I have the syntax right in the last part. Again, I'm simply trying to return a dictionary with the names of files and the number of lines in each, like {a: 132, b: 245, c: 13}.
You can try something along these lines:
import glob
import os

os.chdir(ur_directory)
names = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        names[fn] = sum(1 for line in f if line.strip() and not line.startswith('#'))
print(names)
That will print a dictionary similar to:
{'test_text.txt': 20, 'f1.txt': 3, 'lines.txt': 101, 'foo.txt': 6, 'dat.txt': 6, 'hello.txt': 1, 'f2.txt': 4, 'neglob.txt': 8, 'bar.txt': 6, 'test_reg.txt': 6, 'mission_sp.txt': 71, 'test_nums.txt': 8, 'test.txt': 7, '2591.txt': 8303}
And you can use that Python dict in csv.DictWriter.
If you want the sum of those, just do:
sum(names.values())
I think you should make two changes to your script:
Use glob.glob() to get the list of files matching your desired suffix
Use for line in file_obj to iterate through the lines
Other problem:
The indentation is wrong on your last few lines
You could count the lines in your files with this one-liner:
line_nums = sum(1 for line in open(f) if line.strip() and line[0] != '#')
That would shorten your code segment to:
for f in txt_files:
    count += sum(1 for line in open(os.path.join(d, f))
                 if line[0] != '#' and line.strip())
It looks like you want to use a dictionary to keep track of the counts. You could create one at the top like this: counts = {}
Then (once you fix your tests) you can update it for each non-comment line:
counts = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    counts[f] = 0  # create an entry in the dictionary to keep track of one file's lines
    with open(os.path.join(d, f)) as file_obj:
        for line in file_obj:
            if line.startswith("#"):  #Exclude commented lines
                continue
            elif line.strip():  #Count non-blank lines
                counts[f] += 1
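To finish with the CSV layout from the question (one header row of file names, one row of counts), a sketch using Python 3's csv module:

import csv

# counts maps each file name to its line count, e.g. {'agprices': 132, 'avi': 5}
with open('seriescount.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames=sorted(counts))
    w.writeheader()
    w.writerow(counts)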

How do I organize data in alphabetical order?

So I have some pieces of data stored in a folder as .txt files, e.g. FRED.txt & BOB.txt. Each text file contains five random numbers chosen from 1 to 10, and I am stuck on how to print the names (in alphabetical order) along with each file's highest random number. I know that I have to use the glob or os libraries but I don't really know where to go with them.
So far I have this...
import glob, os
dataFile = open("directory_pathway", "r+")
dataFile.read()
# Somehow printing names & highest number here.
dataFile.close()
Any help is much appreciated. Thanks :)
Get only text file from the input directory by glob module.
Use for loop to iterate every text file.
Read file content.
Get max number from the file content.
Add into result dictionary.
Sort dictionary keys and print values.
Input: the following contents in the FRED.txt file:
2
4
6
8
10
code:
import glob
import os

dir_path = "/home/vivek/Desktop/stackoverflow/input"
text_files = glob.glob(dir_path + "/*.txt")
print("Test Files:", text_files)

result = {}
for i in text_files:
    # Read file content.
    with open(i) as fp:
        data = fp.read()
    max_no = max([int(j) for j in data.split()])
    result[os.path.basename(i)] = max_no

#- Sort and print file names.
sorted_file_names = sorted(result.keys())
for i in sorted_file_names:
    print("File Name: %s, Max Random Number: %d" % (i, result[i]))
output:
Test Files: ['/home/vivek/Desktop/stackoverflow/input/AOB.txt', '/home/vivek/Desktop/stackoverflow/input/ABO.txt', '/home/vivek/Desktop/stackoverflow/input/FRED.txt', '/home/vivek/Desktop/stackoverflow/input/BOB.txt']
File Name: ABO.txt, Max Random Number: 9
File Name: AOB.txt, Max Random Number: 9
File Name: BOB.txt, Max Random Number: 9
File Name: FRED.txt, Max Random Number: 10
import glob, os, re

names = []
path = os.path.join('path_to_dir', '*.txt')
for filename in glob.glob(path):
    names.append(filename)
names.sort()
for filename in names:
    print(re.search(r'\w+\.txt', filename).group(0))
    text = open(filename, 'r')
    data = text.read().split()
    print(max(data, key=int), '\n')
    text.close()
input()
import os

folder = "/path/to/folder/"
result_dict = {}
for i in sorted([i for i in os.listdir(folder) if i.endswith(".txt")]):
    f = open(os.path.join(folder, i))
    a = f.readlines()
    f.close()
    num = sorted([int(j.strip()) for j in a])
    print(num)
    result_dict[i] = num[-1]
for i, j in sorted(result_dict.items(), key=lambda s: s[0]):
    print(i, j)
Sort the file names found with glob, map the contents to int, and print the filename f and the max:
import glob
import os

path = "path/"
for f in sorted(glob.glob(os.path.join(path, "*.txt"))):
    with open(f) as fl:
        print("Filename: {}\nMax value: {}".format(f, max(map(int, fl))))
map returns a map object, so we don't need to create a list to find the max; we only store one line/value at a time.
sorted(glob.glob("*.txt")) will get you the list of filenames, sorted. Then iterate over that list, open each file, and print whatever you like.
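A compact sketch of that approach (assuming one number per line in each file):

import glob

for fn in sorted(glob.glob("*.txt")):
    with open(fn) as f:
        print(fn, max(int(line) for line in f if line.strip()))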
