I have to extract information from PDF documents; all the documents have the same structure. I use the following regular expressions:
regex_objetivos = r"Objetivo([\s\S]*)(?=3\s*\.\s*Justi)"
regex_claves = r"Palabras\s+clave([\s\S]*?)(?:Intro|Introduc|1\s*.)"
regex_resumen = r"Resumen([\s\S]*?)(?=\s*Palabras\s*clave)"
regex_directores = r"Directores:\s*([\s\S]*?)(?:\n|\r\n?)"
The way I extract is as follows:
# Runs inside a loop over the extracted text chunks; the break statements
# below exit that loop once the objectives have been found.
if not b_resumen:
    match = re.search(regex_resumen, text)
    if match:
        b_resumen = True
        resumen = match.group(1).strip()
    else:
        resumen = "no encontrado"
if not b_claves:
    b_claves = True
    match = re.search(regex_claves, text)
    #print(text)
    if match:
        claves = match.group(1).strip()
    else:
        claves = "no encontrado"
if not b_directores:
    match = re.search(regex_directores, text)
    if match:
        b_directores = True
        directores = match.group(1).split(',')
    else:
        directores = ["no encontrado"]
if not b_objetivos:
    match = re.search(regex_objetivos, text)
    if match:
        b_objetivos = True
        objetivos = match.group(1)
        break
    else:
        match = re.search(r"Objetivo([\s\S]*)(?=$)", text)
        if match:
            b_objetivos = True
            objetivos = match.group(1)
            break
        else:
            objetivos = "no encontrado"
My question is: if the documents always have the same structure, is there a way to optimize my matching so that the searches run sequentially instead of each one scanning from position 0? I am trying to optimize my code so that the automation takes less time.
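One approach that fits a fixed document structure (a sketch, assuming the fields appear in the order Resumen, Palabras clave, Objetivo): compile the patterns once and pass the previous match's end offset as the pos argument of the compiled pattern's search() method, so each search resumes where the last one ended instead of rescanning from the start.

import re

# Compiled patterns accept a `pos` argument to search(), so consecutive
# fields can be extracted without rescanning the text from position 0.
PATTERNS = [
    ("resumen", re.compile(r"Resumen([\s\S]*?)(?=\s*Palabras\s*clave)")),
    ("claves", re.compile(r"Palabras\s+clave([\s\S]*?)(?:Intro|Introduc|1\s*.)")),
    ("objetivos", re.compile(r"Objetivo([\s\S]*)(?=3\s*\.\s*Justi)")),
]

def extract_fields(text):
    results = {}
    pos = 0
    for name, pattern in PATTERNS:
        match = pattern.search(text, pos)
        if match:
            results[name] = match.group(1).strip()
            pos = match.end()  # resume the next search after this match
        else:
            results[name] = "no encontrado"
    return results

regex_directores is left out of the sketch because its position relative to the other fields isn't clear from the question; it can be slotted into the list once the order is known.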
I am expanding contractions in natural language processing, so I wrote a contraction-expansion function:
import re
from contractions import contractions_dict

def expand_contractions(text, contraction_mapping=contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    expanded_text = contractions_pattern.sub(expand_match, str(text))
    expanded_text = re.sub("'", " ", expanded_text)
    return expanded_text
When I tried

expand_contractions("I'll be a son")

it worked fine. But when I tried

expand_contractions("I'll be a daughter")

it showed this error:

TypeError: 'NoneType' object is not subscriptable

So I am not able to understand what the problem with the word "daughter" is. Please help me out.
Something in the contraction mapping is wrong. I fixed it by inserting a try/except:
import re
from contractions import contractions_dict

def expand_contractions(text, contraction_mapping=contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    try:
        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
    except TypeError:
        # The NoneType subscript error surfaces as a TypeError;
        # fall back to the raw text in that case.
        return text
    return expanded_text
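A narrower fix is also possible (a sketch, not from the original answer): guard the dictionary lookup inside expand_match itself, so only the unmapped match is left unexpanded instead of abandoning the whole string. This is a drop-in replacement for the inner function above:

def expand_match(contraction):
    match = contraction.group(0)
    # Look the match up as-is, then lower-cased; both lookups can miss.
    expanded = contraction_mapping.get(match) or contraction_mapping.get(match.lower())
    if expanded is None:
        # No mapping entry for this match: leave it unexpanded rather than crash.
        return match
    return match[0] + expanded[1:]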
I'm trying to properly set up spaCy to work with my PySpark code. I simply want to use spaCy's functionality, such as extracting the POS, lemma, etc. for various texts, but it is failing.
I'm running on Windows 10 with Python 3.7.0 and PySpark 2.4.3. The technique I followed was to use a user-defined function (UDF), similar to this Stack Overflow link by user chris. The idea is to loop through the body of text, separate it into chunks, and pass each chunk to the UDF to run spaCy.
def spacy_preprocessing_udf(text, lowercase, remove_stop):
    print("inside spacy_preprocessing")

    def preprocess(text):
        lowercase = True
        remove_stop = True
        arry = ["testing"]
        #print("inside preprocess")
        #print(text)
        global nlp
        try:
            #print("inside try")
            stops = spacy.lang.en.stop_words.STOP_WORDS
            if lowercase:
                text = text.lower()
            text = nlp(text)
            #print("after doc")
            #print("before loop try")
            teststt = ""
            listofword_pos = list()
            for word in text:
                lemma = word.lemma_.strip()
                if lemma:
                    if not remove_stop or (remove_stop and lemma not in stops):
                        lemnlp = nlp(lemma)
                        lemmaPOS = "dum"
                        for leword in lemnlp:
                            lemmaPOS = leword.pos_.strip()
                        # tuplpair = (lemma, lemmaPOS)
                        testst = lemma + " : " + lemmaPOS
                        print(testst)
                        teststt = testst
                        listofword_pos.append(testst)
            return listofword_pos
        except:
            #print("inside except")
            nlp = spacy.load("en")
            stops = spacy.lang.en.stop_words.STOP_WORDS
            if lowercase:
                text = text.lower()
            text = nlp(text)
            listofword_pos = list()
            teststt = ""
            #print("before loop")
            for word in text:
                lemma = word.lemma_.strip()
                if lemma:
                    if not remove_stop or (remove_stop and lemma not in stops):
                        lemnlp = nlp(lemma)
                        lemmaPOS = "dum"
                        for leword in lemnlp:
                            lemmaPOS = leword.pos_.strip()
                        #tuplpair = (lemma, lemmaPOS)
                        testst = lemma + " : " + lemmaPOS
                        print(testst)
                        teststt = testst
                        listofword_pos.append(testst)
            print(len(listofword_pos))
            return listofword_pos

    res_udf = F.udf(preprocess(text), ArrayType(StringType()))
    return res_udf
# called inside a loop:
spacy_result = spacy_preprocessing_udf(text_list[index], True, True)
spacylist = spacylist.append(spacy_result)
The thing is, the code successfully gets inside the spaCy method and even initializes spaCy and calls the various spaCy methods (POS, lemma, etc.). When debugging I see examples such as "run : verb", which is correct. The issue is that it is not properly returning the final list. I get the following error:

TypeError: Invalid function: not a function or callable (__call__ is not defined): <class 'list'>

which originates when it is time to return the list in the UDF. Any help will be greatly appreciated.
Thanks.
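For what it's worth, that error usually means F.udf was handed something that is not callable: preprocess(text) calls the function immediately, so F.udf receives the returned list rather than the function. A minimal sketch of the usual registration pattern (using the names from the question; the DataFrame usage line is hypothetical):

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Pass the function object itself (no parentheses); Spark calls it per row.
preprocess_udf = F.udf(preprocess, ArrayType(StringType()))

# Hypothetical usage on a DataFrame with a "text" column:
# df = df.withColumn("tokens_pos", preprocess_udf(F.col("text")))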
I am working on a stock-prediction project. I want to download historical data from Yahoo Finance and save it in CSV format.
Since I am a beginner in Python, I am unable to correct the error.
My code is as follows:
import re
import urllib2
import calendar
import datetime
import getopt
import sys
import time

crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events=history&crumb={}'

def get_crumble_and_cookie(symbol):
    link = crumble_link.format(symbol)
    response = urllib2.urlopen(link)
    match = re.search(cookie_regex, str(response.info()))
    cookie_str = match.group(1)
    text = response.read()
    match = re.search(crumble_regex, text)
    crumble_str = match.group(1)
    return crumble_str, cookie_str

def download_quote(symbol, date_from, date_to):
    time_stamp_from = calendar.timegm(datetime.datetime.strptime(date_from, "%Y-%m-%d").timetuple())
    time_stamp_to = calendar.timegm(datetime.datetime.strptime(date_to, "%Y-%m-%d").timetuple())
    attempts = 0
    while attempts < 5:
        crumble_str, cookie_str = get_crumble_and_cookie(symbol)
        link = quote_link.format(symbol, time_stamp_from, time_stamp_to, crumble_str)
        #print link
        r = urllib2.Request(link, headers={'Cookie': cookie_str})
        try:
            response = urllib2.urlopen(r)
            text = response.read()
            print "{} downloaded".format(symbol)
            return text
        except urllib2.URLError:
            print "{} failed at attempt # {}".format(symbol, attempts)
            attempts += 1
            time.sleep(2 * attempts)
    return ""

if __name__ == '__main__':
    print get_crumble_and_cookie('KO')
    from_arg = "from"
    to_arg = "to"
    symbol_arg = "symbol"
    output_arg = "o"
    opt_list = (from_arg + "=", to_arg + "=", symbol_arg + "=")
    try:
        options, args = getopt.getopt(sys.argv[1:], output_arg + ":", opt_list)
    except getopt.GetoptError as err:
        print err
    for opt, value in options:
        if opt[2:] == from_arg:
            from_val = value
        elif opt[2:] == to_arg:
            to_val = value
        elif opt[2:] == symbol_arg:
            symbol_val = value
        elif opt[1:] == output_arg:
            output_val = value
    print "downloading {}".format(symbol_val)
    text = download_quote(symbol_val, from_val, to_val)
    with open(output_val, 'wb') as f:
        f.write(text)
    print "{} written to {}".format(symbol_val, output_val)
And the error message that I am getting is:

  File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line 49, in <module>
    print get_crumble_and_cookie('KO')
  File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line 19, in get_crumble_and_cookie
    cookie_str = match.group(1)
AttributeError: 'NoneType' object has no attribute 'group'

So how can we resolve this problem?
Look at these two commands:

match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)

The first one takes the string response.info() and does a regular-expression search for cookie_regex; then match.group(1) is supposed to take the match from it. The problem, however, is that if you print match between these two commands, you'll see that re.search() returned nothing. This means match.group() has nothing to "group", which is why it errors out.
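As an aside (a sketch, not part of the original answer), guarding the match before calling group() makes this kind of failure easier to diagnose:

match = re.search(cookie_regex, str(response.info()))
if match is None:
    # Fail with a clear message instead of an AttributeError further down.
    raise ValueError("no Set-Cookie header found in the response")
cookie_str = match.group(1)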
If you take a closer look at response.info() (you could just add a print response.info() command to your script to see it), you'll see there's a line in the response that starts with "set-cookie:", whose value is what you're trying to capture. However, your cookie_regex string is set to look for a line with "Set-Cookie:". Note the capital letters. When I change that string to all lower-case, the error goes away:
cookie_regex = r'set-cookie: (.*?); '
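Alternatively (a variation I haven't run against Yahoo's servers), the search can be made case-insensitive so either casing of the header matches:

# Untested variation: accept either header casing instead of hardcoding one.
match = re.search(cookie_regex, str(response.info()), flags=re.IGNORECASE)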
I did run into another error after that, where print "downloading {}".format(symbol_val) stops because symbol_val hasn't been defined. It seems that this variable is only declared and assigned when opt[2:] == symbol_arg, so you may want to rewrite that part to cover all cases, as sketched below.
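One way to cover all cases (a sketch; the usage string is illustrative): pre-seed the option variables so a missing flag produces a clear message instead of a NameError.

# Pre-seed so a missing flag is detected explicitly.
from_val = to_val = symbol_val = output_val = None
for opt, value in options:
    if opt[2:] == from_arg:
        from_val = value
    elif opt[2:] == to_arg:
        to_val = value
    elif opt[2:] == symbol_arg:
        symbol_val = value
    elif opt[1:] == output_arg:
        output_val = value
if None in (from_val, to_val, symbol_val, output_val):
    sys.exit("usage: generate2.py --from=YYYY-MM-DD --to=YYYY-MM-DD --symbol=SYM -o FILE")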
s = '^^^# """#$ raw data &*823ohcneuj^^^ Important Information ^^^raw data^^^ Imp Info'
In it, I want to remove the text between each pair of ^^^ delimiters.
The output should be "Important Information Imp Info"
You can do this with regular expressions:
import re
s = '^^^# """#$ raw data &*823ohcneuj^^^ Important Information ^^^raw data^^^ Imp Info'
important = re.compile(r'\^\^\^.*?\^\^\^').sub('', s)
The key elements in this regular expression are:
- escape the ^ character, since it has special meaning
- use the non-greedy match .*?
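As a usage note (my addition, not part of the answer): the substitution leaves behind the spaces that surrounded the removed spans, and these can be collapsed afterwards:

# Collapse leftover runs of whitespace and trim the ends.
important = re.sub(r'\s+', ' ', important).strip()
print(important)  # Important Information Imp Info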
def removeText(text):
    carrotCount = 0
    newText = ""
    for char in text:
        if char == '^':
            # Reset if we have exceeded 2 sets of carets
            if carrotCount == 6:
                carrotCount = 1
            else:
                carrotCount += 1
        # Check if we have reached the first '^^^'
        elif carrotCount == 3:
            # Ignore everything between the carets
            if char != '^':
                continue
            # Add the second set of carets when we find them
            else:
                carrotCount += 1
        # Check if we have reached the end of the second ^^^
        # If we have, we have the message
        elif carrotCount == 6:
            newText += char
    return newText
This will return "Important Information Imp Info".
I am wondering how I could write an algorithm that parses a string for the hashtag symbol '#' and returns the full string, but wherever a word starts with a '#' symbol, that word becomes a link. I am using Python with Google App Engine (webapp2 and Jinja2), and I am building a blog.
Thanks
A more efficient and complete way to find the "hashwords":
import functools

def hash_position(string):
    return string.find('#')

def delimiter_position(string, delimiters):
    positions = filter(lambda x: x >= 0, map(lambda delimiter: string.find(delimiter), delimiters))
    try:
        return functools.reduce(min, positions)
    except TypeError:
        return -1

def get_hashed_words(string, delimiters):
    maximum_length = len(string)
    current_hash_position = hash_position(string)
    string = string[current_hash_position:]
    results = []
    counter = 0
    while current_hash_position != -1:
        current_delimiter_position = delimiter_position(string, delimiters)
        if current_delimiter_position == -1:
            # No delimiter after the last hashtag: keep the rest of the string
            results.append(string)
            break
        else:
            results.append(string[0:current_delimiter_position])
            # Update offsets and the haystack
            string = string[current_delimiter_position:]
            current_hash_position = hash_position(string)
            string = string[current_hash_position:]
    return results

if __name__ == "__main__":
    string = "Please #clarify: What do you #mean with returning somthing as a #link. #herp"
    delimiters = [' ', '.', ',', ':']
    print(get_hashed_words(string, delimiters))
Imperative code that updates the haystack looks a little ugly, but hey, that's what we get for (ab)using mutable variables.
And I still have no idea what you mean by "returning something as a link".
Hope that helps.
Not sure where you get the data for the link, but maybe something like:

['<a href="...">%s</a>' % word for word in input.split() if word[0] == '#']
Are you talking about Twitter? Maybe this?

def get_hashtag_link(hashtag):
    if hashtag.startswith("#"):
        return '<a href="https://twitter.com/hashtag/%s">%s</a>' % (hashtag[1:], hashtag)

>>> get_hashtag_link("#stackoverflow")
'<a href="https://twitter.com/hashtag/stackoverflow">#stackoverflow</a>'

It will return None if the input is not a hashtag.
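A regex-based alternative (my sketch, not from the answers above; the /tag/ URL scheme is a placeholder) handles the whole string in one pass with re.sub:

import re

def linkify_hashtags(text):
    # Wrap every #word in an anchor tag; "/tag/..." is a hypothetical target.
    return re.sub(r'#(\w+)', r'<a href="/tag/\1">#\1</a>', text)

print(linkify_hashtags("Please #clarify what you #mean"))
# Please <a href="/tag/clarify">#clarify</a> what you <a href="/tag/mean">#mean</a>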