import sys
import re
# Names of the two formulas to look up, taken from the command line.
x = sys.argv[1]
y = sys.argv[2]
f = open("formula.txt" ,'r')
# Despite its name, `line` holds the entire file contents, not a single line.
line = f.read()
# NOTE: r'x' and r'y' are the literal letters "x" and "y", not the values of
# the variables x and y, so these searches never look for the requested names.
match = re.search(r'x',line,re.M|re.I)
# The second assignment overwrites the first result; `match` is not used below.
match = re.search(r'y',line,re.M|re.I)
f.close()
# Python 2 print statements; they echo the command-line arguments unchanged.
print x
print y
I tried the program above, but I could not get the expected output.
~
~
The desired output is as follows.
When I execute the above program:
>>>python argument.py circle_area rectangle_area
the output should like this:
x = 2*3.14*r*r
y = l*b
And the given file in program is formula.txt
formula.txt file contains following data;
circle_area = '3.14*r*r'
circle_circumference = '2*3.14*r'
rectangle_area = 'l*b'
rectangle_perimeter = '2(l+b)'
------------------------------------
~
can anybody help me to implement above.
You made so many mistakes in your code.
Don't put variable names inside quotes.
Use capturing groups or lookarounds to match the text you want to print.
Use the .group() method of the match object returned by re.search to get the matched text.
Code should look like.
import sys
import re
# Look up two formulas by name in formula.txt and print their quoted values.
# Usage: python argument.py <name1> <name2>
x = sys.argv[1]
y = sys.argv[2]

# The context manager closes the file even if reading raises.
with open("formula.txt", 'r') as f:
    line = f.read()  # whole file contents, despite the name


def _lookup(name, text):
    """Return the single-quoted value assigned to *name* in *text*, or None."""
    # re.escape keeps any regex metacharacters in the command-line argument
    # literal instead of letting them change the pattern.
    found = re.search(re.escape(name) + r"\s*=\s*'([^']*)'", text, re.M|re.I)
    return found.group(1) if found else None


match1 = _lookup(x, line)
match2 = _lookup(y, line)
# Fail with a readable message instead of an AttributeError on a missing name.
for name, value in ((x, match1), (y, match2)):
    if value is None:
        sys.exit("%s not found in formula.txt" % name)
print(match1)
print(match2)
In r"\s*=\s*'([^']*)'", \s* matches zero or more whitespace characters and [^']* matches zero or more characters other than a single quote. This text (the value part) is captured into group 1. Later we retrieve the captured characters by passing the group index to the match object's group() method.
First off you don't search argument values, rather do:
# re.escape keeps the looked-up name literal; \s* also accepts "name='...'" with no space before "=".
match = re.search(r"^%s\s*=\s*'(.*)'" % re.escape(x), line, re.M|re.I)
Then do something with the match like putting it back into existing variable.
x = match.group(1)
Related
I want to search for multi-line string in a file in python. If there is a match, then I want to get the start line number, end line number, start column and end column number of the match. For example: in the below file,
I want to match the below multi-line string:
pattern = """b'0100000001685c7c35aabe690cc99f947a8172ad075d4401448a212b9f26607d6ec5530915010000006a4730'
b'440220337117278ee2fc7ae222ec1547b3a40fa39a05f91c1e19db60060541c4b3d6e4022020188e1d5d843c'"""
The result of the match should be as: start_line: 2, end_line = 3, start_column: 23 and end_column: 114
The start column is the index in that line where the first character is matched of the pattern and end column is the last index of the line where the last character is matched of the pattern. The end column is shown below:
I tried with the re package of python but it returns None as it could not find any match.
import re
# Two-line text to locate inside test.py.
pattern = """b'0100000001685c7c35aabe690cc99f947a8172ad075d4401448a212b9f26607d6ec5530915010000006a4730'
b'440220337117278ee2fc7ae222ec1547b3a40fa39a05f91c1e19db60060541c4b3d6e4022020188e1d5d843c'"""
with open("test.py") as f:
    content = f.read()
# The text is used as a regular expression: it matches only if test.py contains
# exactly these characters, with a single newline between the two lines.
print(re.search(pattern, content))
I can find the metadata of the location of the match of a single line strings in a file using
with open("test.py") as f:
    data = f.read()
# Iterate over lines: enumerating `data` directly would yield single
# characters, which can never contain the pattern.
for n, line in enumerate(data.splitlines()):
    match_index = line.find(pattern)
    if match_index != -1:
        # Line numbers are reported 1-based; a single-line match starts and
        # ends on the same line.
        print("Start Line:", n + 1)
        print("End Line", n + 1)
        print("Start Column:", match_index)
        print("End Column:", match_index + len(pattern) + 1)
        break
But, I am struggling to make it work for multi-line strings. How can I match multi-line strings in a file and get the metadata of the location of the match in python?
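A pattern that contains \n matches across line boundaries on its own; the re.MULTILINE flag used below only changes where ^ and $ match, so it is harmless here but is not what makes the multi-line match work.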
import re
# Sample text: one letter per line, with a leading and a trailing newline.
string = """
a
b
c
d
e
f
"""
# "c", a newline, then "d" -- captured as group 1.
pattern = r"(c\nd)"
compiled = re.compile(pattern, flags=re.MULTILINE)
match = compiled.search(string)
print(match)
To get the start line, you could count the newline characters as follows
start, stop = match.span()
start_line = string[:start].count('\n')
You could do the same for the end_line, or if you know how many lines is your pattern, you can just add this info to avoid counting twice.
To also get the start column, you can check the line itself, or a pure regex solution could look like:
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
# In a raw string the regex engine itself interprets \n and \s, so the pattern
# matches exactly the same text as before.
pattern = r"(?:.*\n)*(\s*(c\s*\n\s*d)\s*)"
match = re.match(pattern, string, flags=re.MULTILINE)
# Group 1 includes the whitespace before the text, group 2 starts at the text itself.
start_column = match.start(2) - match.start(1)
start_line = string[:match.start(1)].count('\n')
print(start_line, start_column)
However, I think difflib could be more useful here.
Alternative Solution
Below, I got a more creative solution to your problem:
You are interested in the row and column position of some sample text (not a pattern, but a fixed text) in a larger text.
This problem reminds me a lot of image registration; see https://en.wikipedia.org/wiki/Digital_image_correlation_and_tracking for a short introduction or https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.correlate2d.html for a more sophisticated example.
import os
from itertools import zip_longest
import numpy as np
# Text to search in.
text = """Some Title
abc xyz ijk
12345678
abcdefgh
xxxxxxxxxxx
012345678
abcabcabc
yyyyyyyyyyy
"""
# Consecutive lines to locate inside `text`; all template lines have the same
# length, so they form a rectangular array.
template = (
    "12345678",
    "abcdefgh"
)
# Character codes of the template, one row per template line.
moving = np.array([
    [ord(char) for char in line]
    for line in template
])
# splitlines() handles "\n", "\r\n" and "\r" alike. os.linesep is the
# platform's line terminator and need not match the newlines inside a str,
# so splitting on it can leave the whole text as a single "line".
lines = text.splitlines()
values = [
    [ord(char) for char in line]
    for line in lines
]
# use zip_longest to pad the ragged rows with the fill value 0
reference = np.array(list(zip_longest(*values, fillvalue=0))).T
# every window of the template's shape inside the padded text array
windows = np.lib.stride_tricks.sliding_window_view(reference, moving.shape)
# get a distance matrix: one norm per window position
distance = np.linalg.norm(windows - moving, axis=(2, 3))
# find the minimum and return its index location
row, column = np.unravel_index(np.argmin(distance), distance.shape)
print(row, column)
I've seen variations of this question asked a million times but somehow can't figure out a solution for myself.
( PIN 700W_start_stop( STS_PROP( POS_X 1233 )( POS_Y 456 )( BIT_CNT 1 )( CNCT_ID 7071869 ))(USR_PROP( VAR 1( Var_typ -1 )(AssocCd H12 )( termLBLttt +S)( Anorm 011.1)(Amax 1.0))
How do I pull out the number after 'POS_X'? i.e. 1233
I thought I had it figured out using regex because it seems extremely straightforward. But it's not working (go figure).
import re
import pandas as pd
# Empty frame with the columns the extracted values are meant to go into.
df_pin = pd.DataFrame(columns =
['ID','Pos_x','Pos_y','conn_ID','Association_Code','Anorm','Amax'])
with open(r'C:\Users\user1\Documents\Python Scripts\test1.txt', 'r',
encoding="ISO-8859-1") as txt:
for line in txt:
# NOTE: txt.read() consumes the rest of the file, so the search below runs on
# everything after the current line rather than on `line` itself.
data = txt.read()
line = line.strip()
# x is a match object (or None), not the captured digits; x.group(1) gives the number.
x = re.search(r'POS_X (\d+)', data)
# NOTE: not valid syntax -- `ignore_index = True` sits inside the dict literal and
# the closing parenthesis of append( is missing. The key 'POS_X' also differs in
# case from the 'Pos_x' column declared above.
df_pin = df_pin.append({'POS_X' : x, ignore_index = True}
print (x)
Shouldn't this give me the numbers after 'POS_X' and then append them to the corresponding column in my dataframe? There may be multiple occurrences of 'POS_X ###' on the same line; I only want to find the first. What if I wanted to do the same for 'PIN' and extract '700W_start_stop'?
re.search() returns a match object (or None when nothing matches). \d+ is inside the first capture group of the regexp, so you need to use
# A match object is truthy; None (no match) is falsy.
if x:
    # group(1) is the text captured by the first parenthesised group.
    print(x.group(1))
else:
    print("POS_X not found")
to print that.
DEMO
The whole loop should be:
import re
# Print the first POS_X number found on each line of the file.
with open(r'C:\Users\user1\Documents\Python Scripts\test1.txt', 'r', encoding="ISO-8859-1") as txt:
    for line in txt:
        line = line.strip()
        # re.search stops at the first occurrence on the line.
        x = re.search(r'POS_X (\d+)', line)
        if x:
            print(x.group(1))
        else:
            print("POS_X not found in", line)
For PIN, you could use:
# re.search needs the string to search as its second argument.
x = re.search(r'PIN (\w+)', line)
\w matches alphanumeric characters and _.
I am making a webscraping tool which gets the amount of players on a game server.
At the moment the most efficient method of doing this is to use Requests and BS4, to write the HTML source to a txt file, then search that file for
" / "
Unfortunately my HTML contains two forward slashes with spaces on either side, so I need to be able to do something like
"%d / %d"
So that it only gets the one with integers around it. Unfortunately I do not know the values on either side; I just need it to pick only the one with integers in it.
prange = list(range(0, 65))
# The context manager closes the file even if printing raises.
with open("data.txt", "r") as searchfile:
    for line in searchfile:
        # Prints every line containing " / ", whether or not numbers surround it.
        if " / " in line:
            print (line)
Thanks in advance!
You can try using re to find required pattern:
>>> import re
>>> re.search( '(\d+)\s+/\s+(\d+)', 'dsdsd 111 / 222 dsdsds').groups()
('111', '222')
What you want is using regex to search for a specific pattern in your document.
re.search(r'(\d) / (\d)', your_text) will return the first occurrence of X / Y where X and Y are 1-digit numbers (use re.findall or re.finditer to get all occurrences). If you want more than one digit, you can take a look at the regex syntax, and write something like r'(\d+) / (\d+)'.
With your example, you should have:
import re

prange = list(range(0, 65))
# The context manager closes the file even if an error occurs while reading.
with open("data.txt", "r") as searchfile:
    for line in searchfile:
        # Matches only "<digits> / <digits>", skipping slashes without numbers.
        m = re.search(r'(\d+ / \d+)', line)
        if m:
            print (line)
I have a large file with several lines as given below. I want to read in only those lines which have the _INIT pattern in them, then strip off the _INIT from the name and save only the OSD_MODE_15_H part in a variable. Then I need to read the corresponding hex value, 8'h00 in this case, strip off the 8'h from it, replace it with 0x and save it in a variable.
I have been trying to strip off the _INIT, the spaces and the =, and the code is becoming really messy.
localparam OSD_MODE_15_H_ADDR = 16'h038d;
localparam OSD_MODE_15_H_INIT = 8'h00
Can you suggest a lean and clean method to do this?
Thanks!
The following solution uses a regular expression (compiled to speed searching up) to match the relevant lines and extract the needed information. The expression uses named groups "id" and "hexValue" to identify the data we want to extract from the matching line.
import re
# Raw string so that \w and \s reach the regex engine instead of being
# treated as (invalid) string escape sequences.
expression = r"(?P<id>\w+?)_INIT\s*?=.*?'h(?P<hexValue>[0-9a-fA-F]*)"
regex = re.compile(expression)


def getIdAndValueFromInitLine(line):
    """Extract the identifier and hex value from a ``..._INIT = N'h...`` line.

    Returns a tuple ``(id, "0x" + hex_digits)`` where *id* is the name with the
    ``_INIT`` suffix removed, or ``None`` when *line* does not match (not an
    ``_INIT`` parameter, empty line, or any other mismatch).
    """
    mm = regex.search(line)
    if mm is None:
        return None
    return (mm.group("id"), "0x" + mm.group("hexValue"))
EDIT: If I understood the next task correctly, you need to find the hexvalues of those INIT and ADDR lines whose IDs match and make a dictionary of the INIT hexvalue to the ADDR hexvalue.
# Map each ID to the hex value of its _INIT line.
regex = r"(?P<init_id>\w+?)_INIT\s*?=.*?'h(?P<initValue>[0-9a-fA-F]*)"
init_dict = {}
# finditer yields match objects with named groups; findall would return plain
# tuples, which have no groupdict()/group() methods.
for x in re.finditer(regex, lines):
    init_dict[x.group("init_id")] = "0x" + x.group("initValue")
# Map each ID to the hex value of its _ADDR line.
regex = r"(?P<addr_id>\w+?)_ADDR\s*?=.*?'h(?P<addrValue>[0-9a-fA-F]*)"
addr_dict = {}
for y in re.finditer(regex, lines):
    addr_dict[y.group("addr_id")] = "0x" + y.group("addrValue")
# Pair the INIT value with the ADDR value for every ID present in both maps.
init_to_addr_hexvalue_dict = {init_dict[x] : addr_dict[x] for x in init_dict if x in addr_dict}
Even if this is not what you actually need, having init and addr dictionaries might help to achieve your goal easier. If there are several _INIT (or _ADDR) lines with the same ID and different hexvalues then the above dict approach will not work in a straight forward way.
try something like this- not sure what all your requirements are but this should get you close:
with open(someFile, 'r') as infile:
    for line in infile:
        if '_INIT' in line:
            apostropheIndex = line.find("'h")
            # find() returns -1 when "'h" is absent; slicing from -1 + 2 would
            # then silently produce garbage, so skip such lines.
            if apostropheIndex == -1:
                continue
            # rstrip() drops the trailing newline that every line read from
            # the file still carries.
            clean_hex = '0x' + line[apostropheIndex + 2:].rstrip()
In the case of "16'h038d;", clean_hex would be "0x038d;" (need to remove the ";" somehow) and in the case of "8'h00", clean_hex would be "0x00"
Edit: if you want to guard against characters like ";" you could do this and test if a character is alphanumeric:
clean_hex = '0x' + ''.join([s for s in line[apostropheIndex + 2:] if s.isalnum()])
You can use a regular expression and the re.findall() function. For example, to generate a list of tuples with the data you want just try:
import re
# Read the whole file; the context manager closes it afterwards.
with open("your_file") as source:
    lines = source.read()
# Group 1: name without the _INIT suffix; group 2: hex digits after N'h.
regex = r"([\w]+?)_INIT\s*=\s*\d+'h([\da-fA-F]*)"
# findall returns one (name, digits) tuple per match.
res = [(x[0], "0x"+x[1]) for x in re.findall(regex, lines)]
print(res)
The regular expression is very specific for your input example. If the other lines in the file are slightly different you may need to change it a bit.
I am writing a short script to sanitise folder and file names for upload to SharePoint. Since SharePoint is fussy and has some filename rules beyond simple disallowed characters (multiple consecutive periods are disallowed for instance) it seemed like regular expressions were the way to go rather than simple replacement of single characters. One expression that doesn't seem to be working however is:
[/<>*?|:"~#%&{}\\]+
As a simple character class match I would have expected this to work fine, and it appears to do so in notepad++. My expectation was that a string like
St\r/|ng
with the above regex would match \, / and |. However no matter what I do I can only get the string to match the first backslash, or the first of whatever character is in that class that it comes across. This is being done with the Python re library. Does anyone know what the issue is here?
import os, sys, shutil, re
def cleanPath(path):
    """Report which naming rules *path* violates and return *path* unchanged.

    Each rule is a regular expression; for every rule that matches, the text of
    its first match is printed. Only the first match per rule is reported
    because ``search`` is used; ``finditer``/``findall`` would list them all.

    NOTE(review): the surrounding text describes these as SharePoint file-name
    rules; the function itself only reports matches and does not modify *path*.
    """
    # Compiling regex...
    multi_dot = re.compile(r"[\.]{2,}")
    start_dot = re.compile(r"^[\.]")
    end_dot = re.compile(r"[\.]$")
    disallowed_chars = re.compile(r'[/<>*?|:"~#%&{}\\]+')
    # Disallowed name endings. Kept in one list so that every suffix is checked
    # exactly once (previously '_bestanden$' was overwritten by '_file$' under
    # the same variable name and never tested).
    disallowed_suffixes = [
        r'\.files$',
        r'_files$',
        r'-Dateien$',
        r'_fichiers$',
        r'_bestanden$',
        r'_file$',
        r'_archivos$',
        r'-filer$',
        r'_tiedostot$',
        r'_pliki$',
        r'_soubory$',
        r'_elemei$',
        r'_ficheiros$',
        r'_arquivos$',
        r'_dosyalar$',
        r'_datoteke$',
        r'_fitxers$',
        r'_failid$',
        r'_fails$',
        r'_bylos$',
        r'_fajlovi$',
        r'_fitxategiak$',
    ]
    regxlist = [multi_dot, start_dot, end_dot, disallowed_chars]
    regxlist.extend(re.compile(suffix) for suffix in disallowed_suffixes)
    print("************************************\n\n"+path+"\n\n************************************\n")
    for x in regxlist:
        match = x.search(path)
        if match:
            print("\n")
            print("MATCHED")
            print(match.group())
            print("___________________________________________________________________________")
    return path
# Test list of conditions that should be found, some OK, some bad.
testlist = ["string","str....ing","str..ing","str.ing",".string","string.",".string.","$tring",r"st\r\ing","st/r/ing",r"st\r/|ng","/str<i>ng","str.filesing","string.files"]
# Expected outcome for the entry at the same position in testlist.
testlist_ans = ["OK","Match ....","Match ..","OK","Match .","Match .","Match . .","OK",r"Match \ ","Match /",r"Match \/|","Match / < >","OK","Match .files"]
# Print the expectation first, then let cleanPath report what it found.
for expected, candidate in zip(testlist_ans, testlist):
    print(expected)
    cleanPath(candidate)
What Python re command do you use ?
You should use : re.findall
re.sub(pattern,new_txt,subject) # replace all instances of pattern with new_txt
re.findall(pattern,subject) # find all instances