AttributeError: 'spacy.tokens.span.Span' object has no attribute 'string' - python

import re
import spacy
from nltk.corpus import stopwords
import pdfplumber

def extract_All_data(path):
    text = ""
    try:
        with pdfplumber.open(path) as pdf:
            for i in pdf.pages:
                text += i.extract_text()
            return text
    except:
        return None

resume_text = extract_All_data(r"E:\AllResumesPdfs\37202883_Mumbai_6.pdf")
#resume_text = text.lower()

# load pre-trained model
nlp = spacy.load('en_core_web_lg')

# Grab all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'MS', 'M.S', 'M.C.A.',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)
    # Sentence Tokenizer
    nlp_text = [sent.string.strip() for sent in nlp_text.sents]
    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]
    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'((20|19)(\d{2}))'), edu[key])
        if year:
            education.append((key, ''.join(year[0])))
        else:
            education.append(key)
    return education

Education = extract_education(resume_text)
print(Education)
I have downloaded the large model but it is still showing an error for string.
Please help me solve this issue.
Thanks in advance.
C:\Python37\python.exe E:/JobScan/Sample1.py
2021-05-22 09:33:10.781450: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-05-22 09:33:10.781933: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Traceback (most recent call last):
File "E:/JobScan/Sample1.py", line 58, in <module>
Education= extract_education(resume_text)
File "E:/JobScan/Sample1.py", line 37, in extract_education
nlp_text = [sent.string.strip() for sent in nlp_text.sents]
File "E:/JobScan/Sample1.py", line 37, in <listcomp>
nlp_text = [sent.string.strip() for sent in nlp_text.sents]
AttributeError: 'spacy.tokens.span.Span' object has no attribute 'string'
Process finished with exit code 1
This is the console error.

As Tim Roberts said, you want the text attribute.
# change "string" to "text"
nlp_text = [sent.text.strip() for sent in nlp_text.sents]
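For context, the string attribute on Doc and Span was deprecated in spaCy v2 and removed in v3, which is why code copied from older tutorials fails with this AttributeError. Below is a minimal sketch of the fixed sentence split on a stand-in string (the model name follows the question; the sample text is made up for illustration):

import spacy

nlp = spacy.load('en_core_web_lg')
doc = nlp("B.E. in Computer Science, 2014. Worked at Acme Corp in Mumbai.")

# Span.text returns the raw text of the span; strip() trims surrounding whitespace.
sentences = [sent.text.strip() for sent in doc.sents]
print(sentences)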

Related

Python Spacy KeyError: "[E018] Can't retrieve string for hash

I am trying to make my code run on a Raspberry Pi 4 and have been stuck on this error for hours. This code segment throws an error on the Pi but runs perfectly on Windows with the same project:
def create_lem_texts(data):  # as a list
    def sent_to_words(sentences):
        for sentence in sentences:
            yield gensim.utils.simple_preprocess(str(sentence), deacc=True)  # deacc=True removes punctuations

    data_words = list(sent_to_words(data))
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """https://spacy.io/api/annotation"""
        texts_out = []
        print(os.getcwd())
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops)
    print(os.getcwd())
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return data_lemmatized
This code is in turn called by this function:
def assign_topics_tweet(tweets):
    owd = os.getcwd()
    print(owd)
    os.chdir('/home/pi/Documents/pycharm_project_twitter/topic_model/')
    print(os.getcwd())
    lda = LdaModel.load("LDA26")
    print(lda)
    id2word = Dictionary.load('Id2Word')
    print(id2word)
    os.chdir(owd)
    data = create_lem_texts(tweets)
    corpus = [id2word.doc2bow(text) for text in data]
    topics = []
    for tweet in corpus:
        topics_dist = lda.get_document_topics(tweet)
        topics.append(topics_dist)
    return topics
And here is the error message
Traceback (most recent call last):
File "/home/pi/Documents/pycharm_project_twitter/Twitter_Import.py", line 193, in <module>
main()
File "/home/pi/Documents/pycharm_project_twitter/Twitter_Import.py", line 169, in main
topics = assign_topics_tweet(data)
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 238, in assign_topics_tweet
data = create_lem_texts(tweets)
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 76, in create_lem_texts
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 67, in lemmatization
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 67, in <listcomp>
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
File "token.pyx", line 871, in spacy.tokens.token.Token.lemma_.__get__
File "strings.pyx", line 136, in spacy.strings.StringStore.__getitem__
KeyError: "[E018] Can't retrieve string for hash '18446744073541552667'. This usually refers to an issue with the `Vocab` or `StringStore`."
Process finished with exit code 1
I tried reinstalling spaCy and the en model and running it directly on the Pi; the spaCy versions are the same on both my Windows machine and the Pi, and there is basically no information online about this error.
After three days of testing, the problem was solved by simply installing an older version of spaCy, 2.0.1.
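If downgrading is not an option, one possible workaround (an assumption for this particular E018 case, not a verified fix) is to catch the failed StringStore lookup and fall back to the raw token text:

def safe_lemmas(doc, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    # Collect lemmas, tolerating the hash lookup failure seen in the traceback above.
    lemmas = []
    for token in doc:
        if token.pos_ not in allowed_postags:
            continue
        try:
            lemmas.append(token.lemma_)
        except KeyError:
            # E018: lemma hash missing from the StringStore; fall back to the surface form.
            lemmas.append(token.text.lower())
    return lemmas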

python crawler ieee paper keywords

I am trying to use a crawler to get IEEE paper keywords, but now I get an error.
How can I fix my crawler?
My code is here:
import re
import requests
import json
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')
for i in tag[9]:
    s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace("'", '"').replace(";", ''))
and the error is here:
Traceback (most recent call last):
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 90, in <module>
a.get_es_data(offset=0, size=1)
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 53, in get_es_data
self.get_data(link=ieee_link, esid=es_id)
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 65, in get_data
s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace(";", '').replace("'", '"'))
IndexError: list index out of range
Here's another answer. I don't know what you are doing with 's' in your code after the load (replace) in my code.
The code below doesn't throw an error, but again, how are you using 's'?
import re
import requests
import json
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')

# i is a list
for i in tag[9]:
    metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
    metadata = re.findall(metadata_format, i)
    if len(metadata) != 0:
        # convert the list
        convert_to_json = json.dumps(metadata)
        x = json.loads(convert_to_json)
        s = x[0].replace("'", '"').replace(";", '')
        ###########################################
        # I don't know what you plan to do with 's'
        ###########################################
        print(s)
Apparently, in line 65 some of the data provided in i did not suit the regex pattern you're trying to use. Therefore your [0] will not work, as the data returned is not a list of suitable length.
Solution:
x = re.findall('global.document.metadata=(.*;)', i)
if x:
    s = json.loads(x[0].replace("'", '"').replace(";", ''))
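Once s holds the cleaned metadata string, it can be parsed with json.loads and the keywords pulled out. The sketch below assumes the parsed object is a dict containing a 'keywords' entry; the actual field names depend on how IEEE structures global.document.metadata, so inspect metadata.keys() first:

import json

metadata = json.loads(s)  # s is the cleaned metadata string from the answers above
# 'keywords' is an assumed field name; print(metadata.keys()) to confirm what is available.
for kw_group in metadata.get('keywords', []):
    print(kw_group)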

nltk corpus twitter_samples by category

I want to train NLTK with the twitter_samples corpus, but I get an error when I try to load the sample by category.
First I tried like this:
from nltk.corpus import twitter_samples

documents = [(list(twitter_samples.strings(fileid)), category)
             for category in twitter_samples.categories()
             for fileid in twitter_samples.fileids(category)]
but it gave me this error:
Traceback (most recent call last):
File "C:/Users/neptun/PycharmProjects/Thesis/First_sentimental.py", line 6, in <module>
for category in twitter_samples.categories()
File "C:\Users\neptun\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\corpus\util.py", line 119, in __getattr__
return getattr(self, attr)
AttributeError: 'TwitterCorpusReader' object has no attribute 'categories'
I don't know which attributes are available so that I can build my list with positive and negative sentiment.
If you inspect twitter_samples.fileids(), you'll see that there are separate positive and negative files:
>>> twitter_samples.fileids()
['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
So to get the tweets classified as positive or negative, just select the corresponding file. It's not the usual way the nltk handles categorized corpora, but there you have it.
documents = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] +
             [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")])
This will get you a dataset of 10000 tweets. The third file contains another 20000, which apparently are not categorized.
import re
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize

categorized_tweets = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] +
                      [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")])

smilies = [':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
           ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
           '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
           'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
           '<3', ':L', ':-/', '>:/', ':S', '>:[', ':#', ':-(', ':[', ':-||', '=L', ':<',
           ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
           ':c', ':{', '>:\\', ';(', '(', ')', 'via']

categorized_tweets_tokens = []
for tweet in categorized_tweets:
    text = tweet[0]
    for smiley in smilies:
        text = re.sub(re.escape(smiley), '', text)
    categorized_tweets_tokens.append((word_tokenize(text), tweet[1]))
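Since the original goal was to train a classifier, here is a minimal sketch of feeding categorized_tweets_tokens from the snippet above into NLTK's NaiveBayesClassifier; the bag-of-words features and the 80/20 split are illustrative choices, not part of the original answer:

import random
from nltk import NaiveBayesClassifier, classify

def bag_of_words(tokens):
    # Mark each token as present; a deliberately simple feature extractor.
    return {word: True for word in tokens}

featuresets = [(bag_of_words(tokens), label) for tokens, label in categorized_tweets_tokens]
random.shuffle(featuresets)
split = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:split], featuresets[split:]

classifier = NaiveBayesClassifier.train(train_set)
print(classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)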

Python - regex relation extraction

As a part of schoolwork we have been given this code:
>>> IN = re.compile(r'.*\bin\b(?!\b.+ing)')
>>> for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
... for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
... corpus='ieer', pattern = IN):
... print(nltk.sem.rtuple(rel))
We are asked to try it out with some sentences of our own to see the output, so for this I decided to define a function:
def extract(sentence):
    import re
    import nltk
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    for rel in nltk.sem.extract_rels('ORG', 'LOC', sentence, corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(rel))
When I try and run this code:
>>> from extract import extract
>>> extract("The Whitehouse in Washington")
I get the following error:
Traceback (most recent call last):
File "<pyshell#1>", line 1, in <module>
extract("The Whitehouse in Washington")
File "C:/Python34/My Scripts\extract.py", line 6, in extract
for rel in nltk.sem.extract_rels('ORG', 'LOC', sentence, corpus='ieer', pattern = IN):
File "C:\Python34\lib\site-packages\nltk\sem\relextract.py", line 216, in extract_rels
pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
AttributeError: 'str' object has no attribute 'text'
Can anyone help me understand where I am going wrong in my function?
The correct output for the test sentence should be:
[ORG: 'Whitehouse'] 'in' [LOC: 'Washington']
If you look at the method definition of extract_rels, it expects the parsed document as the third argument, and here you are passing the sentence. To overcome this error, you can do the following:
import re
import nltk

# 'tokens' is assumed to be a list of tokenized sentences,
# e.g. tokens = [nltk.word_tokenize("The Whitehouse in Washington")]
tagged_sentences = [nltk.pos_tag(token) for token in tokens]

class doc():
    pass

IN = re.compile(r'.*\bin\b(?!\b.+ing)')
doc.headline = ["test headline for sentence"]
for i, sent in enumerate(tagged_sentences):
    doc.text = nltk.ne_chunk(sent)
    for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(rel))  # you can change it according to your needs
Try it out..!!!
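One way to fold this back into the asker's extract() helper is sketched below; the ad-hoc document object mirrors the answer above (its text and headline attributes are what extract_rels reads), so this is an illustration rather than the official IEER reader API:

import re
import nltk

IN = re.compile(r'.*\bin\b(?!\b.+ing)')

def extract(sentence):
    # Build a minimal object exposing the .headline and .text attributes
    # that nltk.sem.extract_rels expects from an IEER-style document.
    class Document:
        pass
    doc = Document()
    doc.headline = ['dummy headline']
    doc.text = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(rel))

extract("The Whitehouse in Washington")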

Python NLTK: SyntaxError: Non-ASCII character '\xc3' in file (Sentiment Analysis -NLP)

I am playing around with NLTK to do an assignment on sentiment analysis. I am using Python 2.7, NLTK 3.0 and NumPy 1.9.1.
This is the code:
__author__ = 'karan'

import nltk
import re
import sys

def main():
    print("Start");
    # getting the stop words
    stopWords = open("english.txt","r");
    stop_word = stopWords.read().split();
    AllStopWrd = []
    for wd in stop_word:
        AllStopWrd.append(wd);
    print("stop words-> ",AllStopWrd);
    # sample and also cleaning it
    tweet1= 'Love, my new toyí ½í¸í ½í¸#iPhone6. Its good https://twitter.com/Sandra_Ortega/status/513807261769424897/photo/1'
    print("old tweet-> ",tweet1)
    tweet1 = tweet1.lower()
    tweet1 = ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweet1).split())
    print(tweet1);
    tw = tweet1.split()
    print(tw)
    #tokenize
    sentences = nltk.word_tokenize(tweet1)
    print("tokenized ->", sentences)
    #remove stop words
    Otweet =[]
    for w in tw:
        if w not in AllStopWrd:
            Otweet.append(w);
    print("sans stop word-> ",Otweet)
    # get taggers for neg/pos/inc/dec/inv words
    taggers ={}
    negWords = open("neg.txt","r");
    neg_word = negWords.read().split();
    print("neg words-> ",neg_word)
    posWords = open("pos.txt","r");
    pos_word = posWords.read().split();
    print("pos words-> ",pos_word)
    incrWords = open("incr.txt","r");
    inc_word = incrWords.read().split();
    print("incr words-> ",inc_word)
    decrWords = open("decr.txt","r");
    dec_word = decrWords.read().split();
    print("dec wrds-> ",dec_word)
    invWords = open("inverse.txt","r");
    inv_word = invWords.read().split();
    print("inverse words-> ",inv_word)
    for nw in neg_word:
        taggers.update({nw:'negative'});
    for pw in pos_word:
        taggers.update({pw:'positive'});
    for iw in inc_word:
        taggers.update({iw:'inc'});
    for dw in dec_word:
        taggers.update({dw:'dec'});
    for ivw in inv_word:
        taggers.update({ivw:'inv'});
    print("tagger-> ",taggers)
    print(taggers.get('little'))
    # get parts of speech
    posTagger = [nltk.pos_tag(tw)]
    print("posTagger-> ",posTagger)

main();
This is the error that I am getting when running my code:
SyntaxError: Non-ASCII character '\xc3' in file C:/Users/karan/PycharmProjects/mainProject/sentiment.py on line 19, but no encoding declared; see http://www.python.org/peps/pep-0263.html for details
How do I fix this error?
I also tried the code using Python 3.4.2 and with NLTK 3.0 and NumPy 1.9.1 but then I get the error:
Traceback (most recent call last):
File "C:/Users/karan/PycharmProjects/mainProject/sentiment.py", line 80, in <module>
main();
File "C:/Users/karan/PycharmProjects/mainProject/sentiment.py", line 72, in main
posTagger = [nltk.pos_tag(tw)]
File "C:\Python34\lib\site-packages\nltk\tag\__init__.py", line 100, in pos_tag
tagger = load(_POS_TAGGER)
File "C:\Python34\lib\site-packages\nltk\data.py", line 779, in load
resource_val = pickle.load(opened_resource)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xcb in position 0: ordinal not in range(128)
Add the following to the top of your file: # coding=utf-8
If you go to the link in the error you can see the reason why:

Defining the Encoding

Python will default to ASCII as the standard encoding if no other encoding hints are given.
To define a source code encoding, a magic comment must be placed into the source files as either the first or second line in the file, such as:
# coding=<encoding name>
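As a concrete sketch, the magic comment goes on the first or second line of the script; opening the word-list files with an explicit encoding (assuming the files are UTF-8, which is an assumption about your data) also avoids similar decode errors at read time:

# -*- coding: utf-8 -*-
import io

# io.open accepts an encoding argument on both Python 2.7 and Python 3.
with io.open("english.txt", "r", encoding="utf-8") as f:
    stop_words = f.read().split()
print(len(stop_words))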
