Comparing multiple file items using re - python

Currently I have a script that finds all the lines across multiple input files that have something in the format of
Matches: 500 (54.3 %) and prints out the top 10 highest matches by percentage.
I also want it to output the top 10 lines by score, e.g. Score: 4000
import re

def get_values_from_file(filename):
    f = open(filename)
    winpat = re.compile("([\d\.]+)\%")
    xinpat = re.compile("[\d]")  # ISSUE: is this the right regex for it? Score: 500
    values = []
    scores = []
    for line in f.readlines():
        if line.find("Matches") >= 0:
            percn = float(winpat.findall(line)[0])
            values.append(percn)
        elif line.find("Score") >= 0:
            hey = float(xinpat.findall(line)[0])
            scores.append(hey)
    return (scores, values)

all_values = []
all_scores = []
for filename in ["out0.txt", "out1.txt"]:  # and so on
    values = get_values_from_file(filename)
    all_values += values
    all_scores += scores

all_values.sort()
all_values.reverse()
all_scores.sort()  # also for scores
all_scores.reverse()
print(all_values[0:10])
print(all_scores[0:10])
Is my regex for the score format correct? I believe that's where I am having the issue, as it doesn't output both correctly.
Any thoughts? Should I split it into two functions?
Thank you.

Is my regex for the score format correct?
No, it should be r"\d+".
You don't need []. Those brackets establish a character class representing all of the characters inside the brackets. Since you only have one character type inside the bracket, they do nothing.
You only match a single character. You need a * or a + to match a sequence of characters.
You have an unescaped backslash in your string. Use the r prefix to allow the regular expression engine to see the backslash.
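A quick way to see the difference between the two patterns (my own check, not from the answer):

import re

line = "Score: 4000"
print(re.findall(r"[\d]", line))  # ['4', '0', '0', '0'] -- one digit at a time
print(re.findall(r"\d+", line))   # ['4000'] -- the whole number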
Commentary:
If it were me, I'd let the regular expression do all of the work, and skip line.find() altogether:
#UNTESTED
def get_values_from_file(filename):
    winpat = re.compile(r"Matches:\s*\d+\s*\(([\d\.]+)\%\)")
    xinpat = re.compile(r"Score:\s*([\d]+)")
    values = []
    scores = []
    # Note: "with open() as f" automatically closes f
    with open(filename) as f:
        # Note: "for line in f" is more memory efficient
        # than "for line in f.readlines()"
        for line in f:
            win = winpat.match(line)
            xin = xinpat.match(line)
            # group(1) is the captured number; group(0) would be the whole match
            if win: values.append(float(win.group(1)))
            if xin: scores.append(float(xin.group(1)))
    return (scores, values)
Just for fun, here is a version of the routine which calls re.findall exactly once per file:
# TESTED
# Compile this only once to save time
pat = re.compile(r'''
    (?:Matches:\s*\d+\s*\(([\d\.]+)\s*%\))   # "Matches: 300 (43.2%)"
    |
    (?:Score:\s*(\d+))                       # "Score: 4000"
    ''', re.MULTILINE | re.VERBOSE)  # flags passed here; an inline (?mx) that is not at the very start of the pattern is an error in Python 3.11+

def get_values_from_file(filename):
    with open(filename) as f:
        values, scores = zip(*pat.findall(f.read()))
    values = [float(value) for value in values if value]
    scores = [float(score) for score in scores if score]
    return scores, values
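The zip(*...) transposition in that last function is the non-obvious part; here is what it does to findall's output, shown on hand-written data rather than a real file (my own illustration, not from the original answer):

pairs = [('54.3', ''), ('', '4000')]  # findall: one tuple per match, the unused group is ''
values, scores = zip(*pairs)
print(values)  # ('54.3', '')
print(scores)  # ('', '4000')
# which is why the list comprehensions above filter out the empty strings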

No. xinpat will only match single digits, so findall() will return a list of single digits, which is a bit messy. Change it to
xinpat = re.compile(r"[\d]+")
Actually, you don't need the square brackets here, so you could simplify it to
xinpat = re.compile(r"\d+")
BTW, the names winpat and xinpat are a bit opaque. The pat bit is fine, but win and xin? And hey isn't great either. But I guess xin and hey are just temporary names you made up when you decided to expand the program.
Another thing I just noticed, you don't need to do
all_values.sort()
all_values.reverse()
You can (and should) do that in one hit:
all_values.sort(reverse=True)
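A tiny check that the single call gives the same descending order (my own example values, not from the post):

vals = [3.2, 9.9, 1.5]
vals.sort(reverse=True)
print(vals)  # [9.9, 3.2, 1.5]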

Related

How can I replace a number in a string with another number?

I am trying to replace a number in a string with another number. For instance, I have the string "APU12_24F" and I want to add 7 to the second number to make it "APU12_31F".
Right now I am simply able to locate the number in which I'm interested by using string.split.
I can't figure out how to edit the new strings which this produces.
def main():
    f = open("edita15888_debug.txt", "r")
    fl = f.readlines()
    for x in fl:
        if ("APU12" in x):
            list_string = split_string_APU12(x)
            print(list_string)
    return

def split_string_APU12(string):
    # Split the string based on APU12_
    list_string = string.split("APU12_")
    return list_string

main()
The output for this makes sense, as I'll get something like ['', '24F\n']. I just need to change the 24 to 31 and then put it back into the original string.
Feel free to let me know if there is a better approach to this. I'm very new to python and everything I can find online with the available search/replace functions doesn't seem to do what I'd need them to do. Thank you!
Assuming the pattern is _ followed by multiple digits, you can replace it with a regex:
import re
re.sub(r"_(\d+)", lambda r: '_'+str(int(r.group(1)) + 7),'APU12_24F')
This isn't generalized, because I'm not sure what the rest of the data looks like, but maybe something like this will work:
def main():
    f = open("edita15888_debug.txt", "r")
    fl = f.readlines()
    for x in fl:
        if ("APU12" in x):
            list_string = split_string_APU12(x)
            list_string = int(list_string[1].split('F')[0]) + 7
            list_string = "APU12_" + str(list_string)
            print(list_string)
    return

def split_string_APU12(string):
    # Split the string based on APU12_
    list_string = string.split("APU12_")
    return list_string

main()
I'm assuming your strings will be of the format
APU12_##...F
(where ##... means a number with a variable amount of digits, and F could be any letter, but just one). If so, you could do something like this:
# Notice the use of context managers
# I would recommend learning about this for working with files
with open('edita15888_debug.txt', 'r') as f:
    fl = f.readlines()

new_strings = []
for line in fl:
    line = line.strip()  # drop the trailing newline so int() below doesn't choke on it
    beg, end = line.split('_')
    # This splits the end part into number + character
    number, char = int(end[:-1]), end[-1]
    # Here goes your operation on the number
    number += your_quantity  # This may be your +7, for example
    # Now joining back everything together
    new_strings.append(beg + '_' + str(number) + char)
And this would yield you the same list of strings but with the numbers before the last letter modified as you need.
I hope this helps you!
I assumed you need to add seven to a number that comes after an underscore. I hope this function will be helpful:
import re

def add_seven_to_number_after_underscore_in_a_string(aString):
    regex = re.compile(r'_(\d+)')
    match = regex.search(aString)
    return regex.sub('_' + str(int(match.group(1)) + 7), aString)
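For instance, with the example string from the question, this is the result I would expect (my own check, not part of the original answer):

print(add_seven_to_number_after_underscore_in_a_string('APU12_24F'))  # APU12_31F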

How to read strings as integers when reading from a file in python

I have the following code reading in a specific part of a text file. The problem is that these are numbers, not strings, so I want to convert them to ints and read them into a list of some sort.
A sample of the data from the text file is as follows:
However, this is not wholly representative; I have uploaded the full set of data here: http://s000.tinyupload.com/?file_id=08754130146692169643 as a text file.
*NSET, NSET=Nodes_Pushed_Back_IB
99915527, 99915529, 99915530, 99915532, 99915533, 99915548, 99915549, 99915550,
99915551, 99915552, 99915553, 99915554, 99915555, 99915556, 99915557, 99915558,
99915562, 99915563, 99915564, 99915656, 99915657, 99915658, 99915659, 99915660,
99915661, 99915662, 99915663, 99915664, 99915665, 99915666, 99915667, 99915668,
99915669, 99915670, 99915885, 99915886, 99915887, 99915888, 99915889, 99915890,
99915891, 99915892, 99915893, 99915894, 99915895, 99915896, 99915897, 99915898,
99915899, 99915900, 99916042, 99916043, 99916044, 99916045, 99916046, 99916047,
99916048, 99916049, 99916050
*NSET, NSET=Nodes_Pushed_Back_OB
Any help would be much appreciated.
Hi, I am still stuck with this issue. Any more suggestions? The latest code and error message are below. Thanks!
import tkinter as tk
from tkinter import filedialog

file_path = filedialog.askopenfilename()
print(file_path)

data = []
data2 = []
data3 = []
flag = False
with open(file_path, 'r') as f:
    for line in f:
        if line.strip().startswith('*NSET, NSET=Nodes_Pushed_Back_IB'):
            flag = True
        elif line.strip().endswith('*NSET, NSET=Nodes_Pushed_Back_OB'):
            flag = False  # loop stops when condition is false, i.e. if false do nothing
        elif flag:  # as long as flag is true, append
            data.append([int(x) for x in line.strip().split(',')])
The result is the following error:
ValueError: invalid literal for int() with base 10: ''
Instead of reading these as strings, I would like each to be a number in a list, i.e. [98932850 98932852 98932853 98932855 98932856 98932871 98932872 98932873]
In such cases I use regular expressions together with string methods. I would solve this problem like so:
import re

with open(filepath) as f:
    txt = f.read()

# capture everything between the IB header and the next *NSET header (or the end of the file)
g = re.search(r'NSET=Nodes_Pushed_Back_IB(.*?)(?=\*NSET|\Z)', txt, re.S)
snums = g.group(1).replace(',', ' ').split()
numbers = [int(num) for num in snums]
I read the entire text into txt.
Next I use a regular expression with the last portion of your header as an anchor, and capture everything up to the next *NSET header (or the end of the text); the re.S flag means that the dot also matches newlines. I access all the numbers as one unit of text via g.group(1).
Next, I remove all the commas (actually replace them with spaces), because on the resulting text I use split(), which is an excellent function for text items separated by spaces - it doesn't matter how many spaces there are, it splits the text as you would intend.
The rest is just converting the text to numbers using a list comprehension.
Your line contains more than one number, and some separating characters. You could parse that format by judicious application of split and perhaps strip, or you could minimize string handling by having re extract specifically the fields you care about:
ints = list(map(int, re.findall(r'-?\d+', line)))
This regular expression will find each group of digits, optionally prefixed by a minus sign, and then map will apply int to each such group found.
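For example, applied to one of the sample lines above (a quick check, not part of the original answer):

import re

line = "99915527, 99915529, 99915530, 99915532,\n"
ints = list(map(int, re.findall(r'-?\d+', line)))
print(ints)  # [99915527, 99915529, 99915530, 99915532]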
Using a sample of your string:
strings = ' 98932850, 98932852, 98932853, 98932855, 98932856, 98932871, 98932872, 98932873,\n'
I'd just split the string, strip the commas, and return a list of numbers:
numbers = [ int(s.strip(',')) for s in strings.split() ]
Based on your comment, and regarding the larger context of your code, I'd suggest a few things:
from itertools import groupby

number_groups = []
with open('data.txt', 'r') as f:
    for k, g in groupby(f, key=lambda x: x.startswith('*NSET')):
        if k:
            pass
        else:
            number_groups += list(filter('\n'.__ne__, list(g)))  # remove newlines in list

data = []
for group in number_groups:
    for str_num in group.strip('\n').split(','):
        str_num = str_num.strip()
        if str_num:  # a trailing comma leaves an empty string; skip it to avoid int('') errors
            data.append(int(str_num))

How to read varying data into a dictionary?

I need to extract the names of the constants and their corresponding values from a .txt file into a dictionary, where key = NameOfConstants and value = float.
The start of the file looks like this:
speed of light 299792458.0 m/s
gravitational constant 6.67259e-11 m**3/kg/s**2
Planck constant 6.6260755e-34 J*s
elementary charge 1.60217733e-19 C
How do I get the names of the constants easily?
This is my attempt:
with open('constants.txt', 'r') as infile:
    file1 = infile.readlines()

constants = {i.split()[0]: i.split()[1] for i in file1[2:]}
I'm not getting it right with the split(), and I need a little correction!
{' '.join(line.split()[:-2]):' '.join(line.split()[-2:]) for line in lines}
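Applied to the sample lines from the question, that comprehension keeps the value and its unit together as one string - a quick illustration (the float conversion asked for in the question would still need to be added):

lines = [
    "speed of light 299792458.0 m/s",
    "Planck constant 6.6260755e-34 J*s",
]
d = {' '.join(line.split()[:-2]): ' '.join(line.split()[-2:]) for line in lines}
print(d)  # {'speed of light': '299792458.0 m/s', 'Planck constant': '6.6260755e-34 J*s'}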
From your text file I'm unable to tell the exact number of spaces to split on, so the code below is designed to help. Please have a look; it works for the file shown above.
import string

valid_char = string.ascii_letters + ' '
valid_numbers = string.digits + '.eE+-'  # include exponent characters so values like 6.67259e-11 are kept whole

constants = {}
with open('constants.txt') as file1:
    for line in file1.readlines():
        key = ''
        for index, char in enumerate(line):
            if char in valid_char:
                key += char
            else:
                key = key.strip()
                break
        value = ''
        for char in line[index:]:
            if char in valid_numbers:
                value += char
            else:
                break
        constants[key] = float(value)

print(constants)
Have you tried using regular expressions?
For example
([a-z]|\s)*
matches the first part of a line until the digits of the constant begin.
Python provides a very good tutorial on regular expressions (regex)
https://docs.python.org/2/howto/regex.html
You can try out your regex online as well
https://regex101.com/
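A minimal sketch of that idea (my own code, not from the answer; note the character class needs re.IGNORECASE or an added A-Z, since names like "Planck constant" start with a capital letter):

import re

line = "Planck constant 6.6260755e-34 J*s"
m = re.match(r"([a-z\s]*)", line, re.IGNORECASE)
if m:
    name = m.group(1).strip()
    print(name)  # Planck constant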
with open('constants.txt', 'r') as infile:
    lines = infile.readlines()

constants = {' '.join(line.split()[:-2]): float(' '.join(line.split()[-2:-1])) for line in lines[2:]}
The slice skips the two header lines at the top of the file, which are not needed.
This would best be solved using a regexp.
Focussing on your question (how to get the names) and your desires (have something shorter):
import re

# Regular expression fetches all characters
# until the first occurrence of a number
REGEXP = re.compile(r'^([a-zA-Z\s]+)\d.*$')

with open('tst.txt', 'r') as f:
    for line in f:
        match = REGEXP.match(line)
        if match:
            # On a match, the part between parentheses
            # is copied to the first group
            name = match.group(1).strip()
        else:
            # Raise something, or change the regexp :)
            pass
What about re.split?
import re

lines = open(r"C:\txt.txt", 'r').readlines()
for line in lines:
    data = re.split(r'\s{3,}', line)
    print("{0} : {1}".format(data[0], ''.join(data[1:])))
Or use a one-liner to make the dictionary:
{k: v.strip() for k, v in [(re.split(r'\s{3,}', line)[0], ''.join(re.split(r'\s{3,}', line)[1:])) for line in open(r"C:\txt.txt", 'r').readlines()]}
Output:
gravitational constant : 6.67259e-11m**3/kg/s**2
Planck constant : 6.6260755e-34J*s
elementary charge : 1.60217733e-19C
Dictionary:
{'Planck constant': '6.6260755e-34J*s', 'elementary charge': '1.60217733e-19C', 'speed of light': '299792458.0m/s', 'gravitational constant': '6.67259e-11m**3/kg/s**2'}

Splitting lines in a file into string and hex and do operations on the hex values

I have a large file with several lines as given below. I want to read in only those lines that have the _INIT pattern in them, then strip off the _INIT from the name and save only the OSD_MODE_15_H part in a variable. Then I need to read the corresponding hex value, 8'h00 in this case, strip off the 8'h from it, replace it with 0x, and save it in a variable.
I have been trying to strip off the _INIT, the spaces, and the =, and the code is becoming really messy.
localparam OSD_MODE_15_H_ADDR = 16'h038d;
localparam OSD_MODE_15_H_INIT = 8'h00
Can you suggest a lean and clean method to do this?
Thanks!
The following solution uses a regular expression (compiled to speed up searching) to match the relevant lines and extract the needed information. The expression uses named groups "id" and "hexValue" to identify the data we want to extract from the matching line.
import re

expression = r"(?P<id>\w+?)_INIT\s*?=.*?'h(?P<hexValue>[0-9a-fA-F]*)"
regex = re.compile(expression)

def getIdAndValueFromInitLine(line):
    mm = regex.search(line)
    if mm is None:
        return None  # Not the ..._INIT parameter, or the line was empty, or some other mismatch happened
    else:
        return (mm.groupdict()["id"], "0x" + mm.groupdict()["hexValue"])
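For instance, on the sample _INIT line from the question, this is the result I would expect (a quick check, not from the original answer):

print(getIdAndValueFromInitLine("localparam OSD_MODE_15_H_INIT = 8'h00"))
# ('OSD_MODE_15_H', '0x00')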
EDIT: If I understood the next task correctly, you need to find the hexvalues of those INIT and ADDR lines whose IDs match and make a dictionary of the INIT hexvalue to the ADDR hexvalue.
regex = "(?P<init_id>\w+?)_INIT\s*?=.*?'h(?P<initValue>[0-9a-fA-F]*)"
init_dict = {}
for x in re.findall(regex, lines):
init_dict[x.groupdict()["init_id"]] = "0x" + x.groupdict()["initValue"]
regex = "(?P<addr_id>\w+?)_ADDR\s*?=.*?'h(?P<addrValue>[0-9a-fA-F]*)"
addr_dict = {}
for y in re.findall(regex, lines):
addr_dict[y.groupdict()["addr_id"]] = "0x" + y.groupdict()["addrValue"]
init_to_addr_hexvalue_dict = {init_dict[x] : addr_dict[x] for x in init_dict.keys() if x in addr_dict}
Even if this is not exactly what you need, having init and addr dictionaries might make it easier to achieve your goal. If there are several _INIT (or _ADDR) lines with the same ID and different hex values, then the above dict approach will not work in a straightforward way.
Try something like this - not sure what all your requirements are, but this should get you close:
with open(someFile, 'r') as infile:
    for line in infile:
        if '_INIT' in line:
            apostropheIndex = line.find("'h")
            clean_hex = '0x' + line[apostropheIndex + 2:]
In the case of "16'h038d;", clean_hex would be "0x038d;" (need to remove the ";" somehow) and in the case of "8'h00", clean_hex would be "0x00"
Edit: if you want to guard against characters like ";" you could do this and test if a character is alphanumeric:
clean_hex = '0x' + ''.join([s for s in line[apostropheIndex + 2:] if s.isalnum()])
You can use a regular expression and the re.findall() function. For example, to generate a list of tuples with the data you want, just try:
import re

lines = open("your_file").read()
regex = r"([\w]+?)_INIT\s*=\s*\d+'h([\da-fA-F]*)"
res = [(x[0], "0x" + x[1]) for x in re.findall(regex, lines)]
print(res)
The regular expression is very specific for your input example. If the other lines in the file are slightly different you may need to change it a bit.

Help parsing text file in python

I've really been struggling with this one for some time now. I have many text files with a specific format, from which I need to extract all the data and file it into different fields of a database. The struggle is tweaking the parameters for parsing, ensuring I get all the info correctly.
The format is shown below:
WHITESPACE HERE of unknown length.
K PA DETAILS
2 4565434 i need this sentence as one DB record
2 4456788 and this one
5 4879870 as well as this one, content will vary!
X Max - there sometimes is a line beginning with 'Max' here which i don't need
There is a Line here that i do not need!
WHITESPACE HERE of unknown length.
The tough parts were 1) getting rid of whitespace, and 2) separating the fields from each other. See my best attempt below:
dict = {}
XX = (open("XX.txt", "r")).readlines()
for line in XX:
    if line.isspace():
        pass
    elif line.startswith('There is'):
        pass
    elif line.startswith('Max', 2):
        pass
    elif line.startswith('K'):
        pass
    else:
        for word in line.split():
            if word.startswith('4'):
                tmp_PA = word
            elif word == "1" or word == "2" or word == "3" or word == "4" or word == "5":
                tmp_K = word
            else:
                tmp_DETAILS = word
        cu.execute('''INSERT INTO bugInfo2 (pa, k, details) VALUES(?,?,?)''', (tmp_PA, tmp_K, tmp_DETAILS))
At the minute, I can pull the K & PA fields no problem using this; however, my DETAILS is only pulling one word. I need the entire sentence, or at least 25 chars of it.
Thanks very much for reading and I hope you can help! :)
K
You are splitting the whole line into words. You need to split it into the first word, the second word, and the rest, like line.split(None, 2).
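For example, with one of the sample lines from the question (a quick check of what split(None, 2) returns):

line = "2 4565434 i need this sentence as one DB record"
print(line.split(None, 2))
# ['2', '4565434', 'i need this sentence as one DB record']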
I would probably use regular expressions, and use the opposite logic: that is, if the line starts with a number 1 through 5, use it; otherwise pass. Like:
pattern = re.compile(r'([12345])\s+(\d+)\s+(.*\S)')
f = open('XX.txt', 'r')  # No calling readlines; lazy iteration is better
for line in f:
    m = pattern.match(line)
    if m:
        cu.execute('''INSERT INTO bugInfo2 (pa, k, details) VALUES(?,?,?)''',
                   (m.group(2), m.group(1), m.group(3)))
Oh, and of course, you should be using prepared statements. Parsing SQL is orders of magnitude slower than executing it.
If I understand your file format correctly, you can try this script:
filename = 'bug.txt'
f = open(filename, 'r')
foundHeaders = False
records = []
for rawline in f:
    line = rawline.strip()
    if not foundHeaders:
        tokens = line.split()
        if tokens == ['K', 'PA', 'DETAILS']:
            foundHeaders = True
        continue
    else:
        tokens = line.split(None, 2)
        if len(tokens) != 3:
            break
        try:
            K = int(tokens[0])
            PA = int(tokens[1])
        except ValueError:
            break
        records.append((K, PA, tokens[2]))
f.close()

for r in records:
    print(r)  # replace this with your DB insertion code
This will start reading the records when it encounters the header line, and stop as soon as the format of the line is no longer (K,PA,description).
Hope this helps.
Here is my attempt using re
import re

stuff = open("source", "r").readlines()

whitey = re.compile(r"^[\s]+$")
header = re.compile(r"K PA DETAILS")
juicy_info = re.compile(r"^(?P<first>[\d])\s(?P<second>[\d]+)\s(?P<third>.+)$")

for line in stuff:
    if whitey.match(line):
        pass
    elif header.match(line):
        pass
    elif juicy_info.match(line):
        result = juicy_info.search(line)
        print(result.group('third'))
        print(result.group('second'))
        print(result.group('first'))
Using re I can pull the data out and manipulate it on a whim. If you only need the juicy info lines, you can actually take out all the other checks, making this a REALLY concise script.
import re

stuff = open("source", "r").readlines()

# create a regular expression using subpatterns.
# 'first', 'second' and 'third' are our own tags;
# we could call them Adam, Betty, etc.
juicy_info = re.compile(r"^(?P<first>[\d])\s(?P<second>[\d]+)\s(?P<third>.+)$")

for line in stuff:
    result = juicy_info.search(line)
    if result:  # do stuff with the data here, just use the tags we declared earlier
        print(result.group('third'))
        print(result.group('second'))
        print(result.group('first'))
import re

reg = re.compile(r'K[ \t]+PA[ \t]+DETAILS[ \t]*\r?\n'
                 + 3 * r'([1-5])[ \t]+(\d+)[ \t]*([^\r\n]+?)[ \t]*\r?\n')

with open('XX.txt') as f:
    mat = reg.search(f.read())
for tripl in ((2, 1, 3), (5, 4, 6), (8, 7, 9)):
    cu.execute('''INSERT INTO bugInfo2 (pa, k, details) VALUES(?,?,?)''',
               mat.group(*tripl))
I prefer to use [ \t] instead of \s because \s matches the following characters:
blank, '\f', '\n', '\r', '\t', '\v'
and I don't see any reason to use a symbol representing more than what needs to be matched, with the risk of matching stray newlines in places where they shouldn't be.
Edit
It may be sufficient to do:
import re

reg = re.compile(r'^([1-5])[ \t]+(\d+)[ \t]*([^\r\n]+?)[ \t]*$', re.MULTILINE)

with open('XX.txt') as f:
    for mat in reg.finditer(f.read()):
        cu.execute('''INSERT INTO bugInfo2 (pa, k, details) VALUES(?,?,?)''',
                   mat.group(2, 1, 3))
