Find a regular expression in between two characters - python

I have a txt file which contains the following line.
<KEY key="Spread" keyvalue="FILENAME">
How can I extract FILENAME from the above using regular expressions
So far I have tried (in my python script):
if '"Spread" keyvalue' in line:
n = re.search(r'\keyvalue="(.*)', line)
name = n.group()
print name
This gives an output of:
keyvalue="FILENAME">
but I only want to output:
FILENAME
What is the regular expression I need?

Change your regex to,
n = re.search(r'\bkeyvalue="(.*?)"', line)
name = n.group(1)
Example:
>>> import re
>>> s = '''<KEY key="Spread" keyvalue="FILENAME">'''
>>> n = re.search(r'\bkeyvalue="(.*?)"', s)
>>> n.group(1)
'FILENAME'
>>>
OR
Use BeautifulSoup.
>>> from bs4 import BeautifulSoup
>>> xml = '''<KEY key="Spread" keyvalue="FILENAME">'''
>>> soup = BeautifulSoup(xml, 'lxml')
>>> s = soup.find('key', attrs={'key':'Spread'})
>>> s.get('keyvalue', None)
'FILENAME'

Another pattern to try:
>>> line = '<KEY key="Spread" keyvalue="FILENAME">'
>>> re.findall('\s+keyvalue=\"([^"]+)\"', line)
['FILENAME']

Try following regex. I'm using lookbehind feature.:
(?<=keyvalue=\").*?(?=\")
Your code should look like:
line = '<KEY key="Spread" keyvalue="FILENAME">'
match = re.search(r"(?<=keyvalue=\").*?(?=\")", line, re.MULTILINE)
if match:
result = match.group()
print(result)
If match is successful, it should print FILENAME.

Related

re.sub isn't matching when it seems it should

any help as to why this regex isnt' matching<td>\n etc? i tested it successfully on pythex.org. Basically i'm just trying to clean up the output so it just says myfile.doc. I also tried (<td>)?\\n\s+(</td>)?
>>> from bs4 import BeautifulSoup
>>> from pprint import pprint
>>> import re
>>> soup = BeautifulSoup(open("/home/user/message_tracking.html"), "html.parser")
>>>
>>> filename = str(soup.findAll("td", text=re.compile(r"\.[a-z]{3,}")))
>>> print filename
[<td>\n myfile.doc\n </td>]
>>> duh = re.sub("(<td>)?\n\s+(</td>)?", '', filename)
>>> print duh
[<td>\n myfile.doc\n </td>]
It's hard to tell without seeing the repr(filename), but I think your problem is the confusing of real newline characters with escaped newline characters.
Compare and contrast the examples below:
>>> pattern = "(<td>)?\n\s+(</td>)?"
>>> filename1 = '[<td>\n myfile.doc\n </td>]'
>>> filename2 = r'[<td>\n myfile.doc\n </td>]'
>>>
>>> re.sub(pattern, '', filename1)
'[myfile.doc]'
>>> re.sub(pattern, '', filename2)
'[<td>\\n myfile.doc\\n </td>]'
If your goal is to just get the stripped string from within the <td> tag you can just let BeautifulSoup do it for you by getting the stripped_strings attribute of a tag:
from bs4 import BeautifulSoup
soup = BeautifulSoup(open("/home/user/message_tracking.html"),"html.parser")
filename_tag = soup.find("td", text=re.compile(r"\.[a-z]{3,}"))) #finds the first td string in the html with specified text
filename_string = filename_tag.stripped_strings
print filename_string
If you want to extract further strings from tags of the same type you can then use findNext to extract the next td tag after the current one:
filename_tag = soup.findNext("td", text=re.compile(r"\.[a-z]{3,}"))) #finds the next td string in the html with specified text after current one
filename_string = filename_tag.stripped_strings
print filename_string
And then loop through...

regex works on pythex but not python2.7, finding unicode representations by regex

I am having a strange regex issue where my regex works on pythex, but not in python itself. I am using 2.7 right now. I want to remove all unicode instances like \x92, of which there are many (like 'Thomas Bradley \x93Brad\x94 Garza',:
import re, requests
def purify(string):
strange_issue = r"""\\t<td><font size=2>G<td><a href="http://facebook.com/KilledByPolice/posts/625590984135709" target=new><font size=2><center>facebook.com/KilledByPolice/posts/625590984135709\t</a><td><a href="http://www.orlandosentinel.com/news/local/lake/os-leesburg-officer-involved-shooting-20130507"""
unicode_chars_rgx = r"[\\][x]\d+"
unicode_matches = re.findall(unicode_chars_rgx, string)
bad_list = [strange_issue]
bad_list.extend(unicode_matches)
for item in bad_list:
string = string.replace(item, "")
return string
name_rgx = r"(?:[<][TDtd][>])|(?:target[=]new[>])(?P<the_deceased>[A-Z].*?)[,]"
urls = {2013: "http://www.killedbypolice.net/kbp2013.html",
2014: "http://www.killedbypolice.net/kbp2014.html",
2015: "http://www.killedbypolice.net/" }
names_of_the_dead = []
for url in urls.values():
response = requests.get(url)
content = response.content
people_killed_by_police_that_year_alone = re.findall(name_rgx, content)
for dead_person in people_killed_by_police_that_year_alone:
names_of_the_dead.append(purify(dead_person))
dead_americans_as_string = ", ".join(names_of_the_dead)
print("RIP, {} since 2013:\n".format(len(names_of_the_dead))) # 3085! :)
print(dead_americans_as_string)
In [95]: unicode_chars_rgx = r"[\\][x]\d+"
In [96]: testcase = "Myron De\x92Shawn May"
In [97]: x = purify(testcase)
In [98]: x
Out[98]: 'Myron De\x92Shawn May'
In [103]: match = re.match(unicode_chars_rgx, testcase)
In [104]: match
How can I get these \x00 characters out? Thank you
Certainly not by trying to find things that look like "\\x00".
If you want to destroy the data:
>>> re.sub('[\x7f-\xff]', '', "Myron De\x92Shawn May")
'Myron DeShawn May'
More work, but tries to preserve the text as well as possible:
>>> import unidecode
>>> unidecode.unidecode("Myron De\x92Shawn May".decode('cp1251'))
"Myron De'Shawn May"

python re module group, how to extract all matching group?

 I have something confuse about the re module.
 Supose I have the following text:
<grp>
<i>i1</i>
<i>i2</i>
<i>i3</i>
...
</grp>
 I use the following re to extract the <i></i> part of the text:
>>> t = "<grp> <i>i1</i> <i>i2</i> <i>i3</i> ... </grp>"
>>> import re
>>> re.match("<grp>.*(<i>.*?</i>).*</grp>", t).group(1)
'<i>i3</i>'
>>>
 I only get the last match items.
 My question is how can extract all the match items using only reg expression? for example: extract <i>i1</i> <i>i2</i> <i>i3</i> in a list ['<i>i1</i>', '<i>i2</i>', '<i>i3</i>']
  Thanks a lot!
You can easily do that using re.findall():
import re
result = re.findall("<i>.*?</i>", t)
>>> print result
['<i>i1</i>', '<i>i2</i>', '<i>i3</i>']
Why don't use an XML parser, like xml.etree.ElementTree from Python standard library:
import xml.etree.ElementTree as ET
data = """
<grp>
<i>i1</i>
<i>i2</i>
<i>i3</i>
</grp>
"""
tree = ET.fromstring(data)
results = tree.findall('.//i')
print [ET.tostring(el).strip() for el in results]
print [el.text for el in results] # if you need just text inside the tags
Prints:
['<i>i1</i>', '<i>i2</i>', '<i>i3</i>']
['i1', 'i2', 'i3']

Python - how to delete all characters in all lines after some sign?

I want to delete all characters in all lines after the # sign.
I wrote some piece of code:
#!/usr/bin/env python
import sys, re, urllib2
url = 'http://varenhor.st/wp-content/uploads/emails.txt'
document = urllib2.urlopen(url)
html = document.read()
html2 = html[0]
for x in html.rsplit('#'):
print x
But it only deletes # sign and copies the rest of characters into next line.
So how I can modify this code, to delete all characters in all lines after #?
Should I use a regex?
You are splitting too many times; use str.rpartition() instead and just ignore the part after #. Do this per line:
for line in html.splitlines():
cleaned = line.rpartition('#')[0]
print cleaned
or, for older Python versions, limit str.rsplit() to just 1 split, and again only take the first result:
for line in html.splitlines():
cleaned = line.rsplit('#', 1)[0]
print cleaned
I used str.splitlines() to cleanly split a text regardless of newline style. You can also loop directly over the urllib2 response file object:
url = 'http://varenhor.st/wp-content/uploads/emails.txt'
document = urllib2.urlopen(url)
for line in document:
cleaned = line.rpartition('#')[0]
print cleaned
Demo:
>>> import urllib2
>>> url = 'http://varenhor.st/wp-content/uploads/emails.txt'
>>> document = urllib2.urlopen(url)
>>> for line in document:
... cleaned = line.rpartition('#')[0]
... print cleaned
...
ADAKorb...
AllisonSarahMoo...
Artemislinked...
BTBottg...
BennettLee...
Billa...
# etc.
You can use Python's slice notation:
import re
import sys
import urllib2
url = 'http://varenhor.st/wp-content/uploads/emails.txt'
document = urllib2.urlopen(url)
html = document.read()
for line in html.splitlines():
at_index = line.index('#')
print line[:at_index]
Since strings are sequences, you can slice them. For instance,
hello_world = 'Hello World'
hello = hello_world[:5]
world = hello_world[6:]
Bear in mind, slicing returns a new sequence and doesn't modify the original sequence.
Since you already imported re, you can use it:
document = urllib2.urlopen(url)
reg_ptn = re.compile(r'#.*')
for line in document:
print reg_ptn.sub('', line)

REGEX parsing commands from latex lines - Python

I'm trying to parse and remove any \command (\textit, etc...) from each line loaded (from .tex file or other commands from lilypond files as [\clef, \key, \time]).
How could I do that?
What I've tried
import re
f = open('example.tex')
lines = f.readlines()
f.close()
pattern = '^\\*([a-z]|[0-9])' # this is the wrong regex!!
clean = []
for line in lines:
remove = re.match(pattern, line)
if remove:
clean.append(remove.group())
print(clean)
Example
Input
#!/usr/bin/latex
\item More things
\subitem Anything
Expected output
More things
Anything
You could use a simple regex substitution using this pattern ^\\[^\s]*:
Sample code in python:
import re
p = re.compile(r"^\\[^\s]*", re.MULTILINE)
str = '''
\item More things
\subitem Anything
'''
subst = ""
print re.sub(p, subst, str)
The result would be:
More things
Anything
This will work:
'\\\w+\s'
It searches for the backslash, then for one or more characters, and a space.

Categories

Resources