Python - regex relation extraction - python

As a part of schoolwork we have been given this code:
>>> IN = re.compile(r'.*\bin\b(?!\b.+ing)')
>>> for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
... for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
... corpus='ieer', pattern = IN):
... print(nltk.sem.rtuple(rel))
We are asked to try it out with some sentences of our own to see the output, so for this I decided to define a function:
def extract(sentence):
import re
import nltk
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for rel in nltk.sem.extract_rels('ORG', 'LOC', sentence, corpus='ieer', pattern = IN):
print(nltk.sem.rtuple(rel))
When I try and run this code:
>>> from extract import extract
>>> extract("The Whitehouse in Washington")
I get the following error:
Traceback (most recent call last):
File "<pyshell#1>", line 1, in <module>
extract("The Whitehouse in Washington")
File "C:/Python34/My Scripts\extract.py", line 6, in extract
for rel in nltk.sem.extract_rels('ORG', 'LOC', sentence, corpus='ieer', pattern = IN):
File "C:\Python34\lib\site-packages\nltk\sem\relextract.py", line 216, in extract_rels
pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
AttributeError: 'str' object has no attribute 'text'
Can anyone help me understand where I am going wrong in my function?
The correct output for the test sentence should be:
[ORG: 'Whitehouse'] 'in' [LOC: 'Washington']

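If you look at the definition of extract_rels, it expects a parsed document as its third argument: as the traceback shows, it reads doc.text and doc.headline.
Here you are passing a plain string, which has neither attribute. To overcome this error, you can build a minimal document object yourself (tokens below is a list of word-tokenized sentences):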
tagged_sentences = [ nltk.pos_tag(token) for token in tokens]
class doc():
pass
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
doc.headline=["test headline for sentence"]
for i,sent in enumerate(tagged_sentences):
doc.text = nltk.ne_chunk(sent)
for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
print(nltk.sem.rtuple(rel) )  # you can change it accordingly
Try it out..!!!

Related

nltk corpus tweeter_sample by category

I want to train NLTK with the twitter_samples corpus, but I get an error when I try to load the samples by category.
First I tried it like this:
from nltk.corpus import twitter_samples
documents = [(list(twitter_samples.strings(fileid)), category)
for category in twitter_samples.categories()
for fileid in twitter_samples.fileids(category)]
but it gave me this error:
Traceback (most recent call last):
File "C:/Users/neptun/PycharmProjects/Thesis/First_sentimental.py", line 6, in <module>
for category in twitter_samples.categories()
File "C:\Users\neptun\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\corpus\util.py", line 119, in __getattr__
return getattr(self, attr)
AttributeError: 'TwitterCorpusReader' object has no attribute 'categories'
I don't know how to give them the available attributes in order to have my list with positive and negative sentiment.
If you inspect twitter_samples.fileids(), you'll see that there are separate positive and negative files:
>>> twitter_samples.fileids()
['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
So to get the tweets classified as positive or negative, just select the corresponding file. It's not the usual way the nltk handles categorized corpora, but there you have it.
documents = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] +
[(t, "neg") for t in twitter_samples.strings("negative_tweets.json")])
This will get you a dataset of 10000 tweets. The third file contains another 20000, which apparently are not categorized.
categorized_tweets = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] +
[(t, "neg") for t in twitter_samples.strings("negative_tweets.json")])
smilies = [':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
'=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
'<3', ':L', ':-/', '>:/', ':S', '>:[', ':#', ':-(', ':[', ':-||', '=L', ':<',
':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
':c', ':{', '>:\\', ';(', '(', ')', 'via']
categorized_tweets_tokens = []
for tweet in categorized_tweets:
text = tweet[0]
for smiley in smilies:
text = re.sub(re.escape(smiley), '', text)
categorized_tweets_tokens.append((word_tokenize(text), tweet[1]))

passing string to re.sub not working in Python

This is my progress so far with this regex function:
import os, re
dpath="/root/tree/def/"
fmatch = re.compile(r'\s+''[\[]+''[A-Z]+''[\]]+')
pmatch = fmatch.match('[FLAC]')
def replace(pmatch,df):
m = re.sub(fmatch,df)
print (m)
def regex(dpath):
for df in os.listdir(dpath):
replace(pmatch, df)
regex (dpath)
First I loop over the entries in dpath, then pass each name string to replace(). But I am getting a missing argument 'string' error:
root@debian:~# python regex3.py
Traceback (most recent call last):
File "regex3.py", line 18, in <module>
regex (dpath)
File "regex3.py", line 16, in regex
replace(pmatch, df)
File "regex3.py", line 9, in replace
m = re.sub(fmatch,df)
TypeError: sub() missing 1 required positional argument: 'string'
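It seems that you want to replace all matches of the regex \s+[\[]+[A-Z]+[\]]+ with [FLAC]. re.sub() takes the pattern, then the replacement, then the string to search, so re.sub(fmatch, df) treats df as the replacement and has no string left to work on.
Make sure you do the following: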
def replace(pmatch,df):
m = fmatch.sub('[FLAC]', df)
print (m)
Using @martin-konecny's example,
I got the following to work.
Create Files for Example
# Run this in your Shell/Terminal
touch /tmp/abc.FLAC
touch /tmp/abcd.FLAC
Run Python
import re
import os
dpath = '/tmp/'
fmatch = re.compile(r'.+\.FLAC')
pmatch = fmatch.match('[FLAC]')
def replace(pmatch, df):
m = fmatch.sub('[REDACTED]', df)
print(m)
def regex(dpath):
for df in os.listdir(dpath):
replace(pmatch, df)
regex(dpath)
Result:
# ...
# [REDACTED]
# [REDACTED]
# ...
Great if you want to run a search and keep a selection of your results secret.

Python object validation thanks to a Schema

I want to validate a Python object against a schema. For this I found the schema library.
I would like to validate a numeric string:
a = {
'phone_number': '12233'
}
Do you know how I can validate this string with a regex?
So far, I only know how to validate that a value is a string:
Schema(str).validate('12')
Schema will call any callables; simply provide a function that uses a regular expression:
import re
pattern = re.compile('^12\d+$')
Schema(And(str, lambda x: pattern.match(x) is not None))
Demo:
>>> import re
>>> from schema import Schema, And
>>> pattern = re.compile('^12\d+$')
>>> s = Schema(And(str, lambda x: pattern.match(x) is not None))
>>> s.validate('123234')
'123234'
>>> s.validate('42')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/mj/Development/venvs/stackoverflow-2.7/lib/python2.7/site-packages/schema.py", line 153, in validate
raise SchemaError([None] + x.autos, [e] + x.errors)
schema.SchemaError: <lambda>('42') should evaluate to True

BeautifulSoup doesnt return all HTML

So I'm trying to extract the value from a line of HTML that looks like this:
<input type="hidden" name="_ref_ck" value="41d875b47692bb0211ada153004a663f">
and to get the value I'm doing:
self.ref = soup.find("input",{"name":"_ref_ck"}).get("value")
and it's working fine for me, but I gave the program to a friend of mine to beta test and he is getting an error like this:
Traceback (most recent call last):
File "C:\Users\Daniel\AppData\Local\Temp\Rar$DI85.192\Invent Manager.py", line 262, in onOK
self.main = GUI(None, -1, 'Inventory Manager')
File "C:\Users\Daniel\AppData\Local\Temp\Rar$DI85.192\Invent Manager.py", line 284, in __init__
self.inv.Login(log.user)
File "C:\Users\Daniel\AppData\Local\Temp\Rar$DI85.192\Invent Manager.py", line 34, in Login
self.get_ref_ck()
File "C:\Users\Daniel\AppData\Local\Temp\Rar$DI85.192\Invent Manager.py", line 43, in get_ref_ck
self.ref = soup.find('input',{'name':'_ref_ck'}).get("value")
AttributeError: 'NoneType' object has no attribute 'get'
which means that soup.find() is returning None for some reason.
So I told him to send me the HTML that the request returns, and it was fine; then I told him to give me the soup, and it only had the top part of the page. I can't figure out why.
This means that BeautifulSoup is only parsing part of the HTML it's receiving.
My question is why this happens, or whether there is an easy way I could do this with a regex or something else. Thanks!
Here's a quick pyparsing-based solution walkthrough:
Import HTML parsing helpers from pyparsing
>>> from pyparsing import makeHTMLTags, withAttribute
Define your desired tag expression (makeHTMLTags returns starting and ending tag matching expressions, you just want a starting expression, so we just take the 0'th returned value).
>>> inputTag = makeHTMLTags("input")[0]
We only want input tags whose name attribute is "_ref_ck"; use withAttribute to do this filtering.
>>> inputTag.setParseAction(withAttribute(name="_ref_ck"))
Now define your sample input, and use the inputTag expression definition to search for a match.
>>> html = '''<input type="hidden" name="_ref_ck" value="41d875b47692bb0211ada153004a663f">'''
>>> tagdata = inputTag.searchString(html)[0]
Call tagdata.dump() to see all parsed tokens and available named results.
>>> print (tagdata.dump())
['input', ['type', 'hidden'], ['name', '_ref_ck'], ['value', '41d875b47692bb0211ada153004a663f'], False]
- empty: False
- name: _ref_ck
- startInput: ['input', ['type', 'hidden'], ['name', '_ref_ck'], ['value', '41d875b47692bb0211ada153004a663f'], False]
- empty: False
- name: _ref_ck
- tag: input
- type: hidden
- value: 41d875b47692bb0211ada153004a663f
- tag: input
- type: hidden
- value: 41d875b47692bb0211ada153004a663f
Use tagdata.value to get the value attribute:
>>> print (tagdata.value)
41d875b47692bb0211ada153004a663f

attribute groups does not belong to object

The following code works as expected if I declare the "line" variable at the beginning of the script. something like ...
s = "Jul 15 12:12:51 whitelist logger: 1|999999999999|id:d9faff7c-4016-4343-b494-37028763bb66 submit date:1307130919 done date:1307130919 stat:DELIVRD err:0|L_VB3_NM_K_P|1373687445|vivnel2|L_VB3_GH_K_P|promo_camp1-bd153424349bc647|1"
When I open a file and loop through its lines, the groups attribute does not work. I get an error: AttributeError: 'NoneType' object has no attribute 'groups'
# cat mylast.py
import re
f = open('customer.csv')
for line in f:
logger_re = re.compile(
"logger: ([^ ]+)\
submit date:(\d+)\
done date:(\d+)\
stat:(.+)\
err:(.+)$")
myvalues = logger_re.search(line).groups()
print myvalues
f.close()
Exception:
# python mylast.py
Traceback (most recent call last):
File "mylast.py", line 13, in ?
myvalues = logger_re.search(line).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
Your regular expression is not matching your actual file contents.
As such, logger_re.search(line) returns None.
The problem here is that you indented your regular expression but did not compensate for the extra whitespace:
logger_re = re.compile(
"logger: ([^ ]+)\
submit date:(\d+)\
done date:(\d+)\
stat:(.+)\
err:(.+)$")
Note that the whitespace at the start of the line there matters. Use separate strings (Python will join them at compile time):
logger_re = re.compile(
"logger: ([^ ]+) "
"submit date:(\d+) "
"done date:(\d+) "
"stat:(.+) "
"err:(.+)$")
logger_re.search(line) returns None if no match was found. You need to check that the returned match object is not None before calling groups() on it, for example by storing it in a variable first and skipping lines where it is None.

Categories

Resources