parsing twitter feed using python and pydev
Using the code from here, I get an error: http://sentdex.com/sentiment-analysisbig-data-and-python-tutorials-algorithmic-trading/how-to-parse-twitter-code-and-tutorial/

The code is:
```
import re
from re import sub
import time
import cookielib
from cookielib import CookieJar
import urllib2
from urllib2 import urlopen
import difflib

cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
startingLink = ‘https://twitter.com/search/realtime?q=’

def twitParser():
    oldTwit = []
    newTwit = []
    howSimAr = [.5,.5,.5,.5,.5]
    while 1 < 2:
        try:
            sourceCode = opener.open(‘https://twitter.com/search/realtime?q=’+keyWord+‘& src=hash’).read()
            splitSource = re.findall(r’<p class=”js-tweet-text tweet-text”>(.*?)</p>’,sourceCode)
            for item in splitSource:
                print ”
                print ”
                print ”
                print ‘ __________________________’
                aTweet = re.sub(r’<.*?>’,”,item)
                print aTweet
                newTwit.append(aTweet)
            comparison = difflib.SequenceMatcher(None, newTwit, oldTwit)
            howSim = comparison.ratio()
            print ‘#############’
            print ‘This selection is’,howSim,‘similar to the past’
            howSimAr.append(howSim)
            howSimAr.remove(howSimAr[0])
            waitMultiplier = reduce(lambda x, y: x+y, howSimAr)/len(howSimAr)
            print ”
            print ‘The current similarity array:’,howSimAr
            print ‘Our current Multiplier:’, waitMultiplier
            print ‘###############’
            oldTwit = [None]
            for eachItem in newTwit:
                oldTwit.append(eachItem)
            newTwit = [None]
            time.sleep(waitMultiplier*45)
        except Exception, e:
            print str(e)
            print ‘error in the main try’
            time.sleep(555)

twitParser()
```
I get the error:

```
  File "C:\Users\thisismypc\workspace\hithere\hithere", line 16
SyntaxError: Non-ASCII character '\xe2' in file C:\Users\thisismypc\workspace\hithere\hithere on line 16, but no encoding declared;
see http://python.org/dev/peps/pep-0263/ for details
```
It looks like you are using the ‘ and ” characters, which are indeed non-ASCII. Try replacing them with ' and ", respectively.
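For instance, here is a minimal sketch of the fix, assuming the flagged line is the `startingLink` assignment: retype every string delimiter as a plain ASCII quote. Declaring a source encoding per PEP 263 only silences the encoding complaint; the curly quotes would still be invalid syntax where they are used as string delimiters.

```
# -*- coding: utf-8 -*-  # PEP 263 declaration (optional once the quotes are ASCII)

# Plain ASCII quotes instead of the word-processor ‘...’ characters:
startingLink = 'https://twitter.com/search/realtime?q='
```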
Related
ModuleNotFoundError: No module named 'exceptions' (from docx import opendocx, getdocumenttext)
```
Traceback (most recent call last):
  File "C:\xampp\htdocs\Plag\scripts\main.py", line 8, in <module>
    from extractdocx import *
  File "C:\xampp\htdocs\Plag\scripts\extractdocx.py", line 18, in <module>
    from docx import opendocx, getdocumenttext
  File "C:\Users\zeesh\AppData\Local\Programs\Python\Python39\lib\site-packages\docx.py", line 30, in <module>
    from exceptions import PendingDeprecationWarning
ModuleNotFoundError: No module named 'exceptions'
```

main.py:

```
# -*- coding: utf-8 -*-
# Master script for the plagiarism-checker
# Coded by: Shashank S Rao

#import other modules
from cosineSim import *
from htmlstrip import *
from extractdocx import *

#import required modules
import codecs
import traceback
import sys
import operator
import urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse
import json as simplejson

# Given a text string, remove all non-alphanumeric
# characters (using Unicode definition of alphanumeric).
def getQueries(text,n):
    import re
    sentenceEnders = re.compile('[.!?]')
    sentenceList = sentenceEnders.split(text)
    sentencesplits = []
    for sentence in sentenceList:
        x = re.compile(r'\W+', re.UNICODE).split(sentence)
        x = [ele for ele in x if ele != '']
        sentencesplits.append(x)
    finalq = []
    for sentence in sentencesplits:
        l = len(sentence)
        l = l/n
        index = 0
        for i in range(0,l):
            finalq.append(sentence[index:index+n])
            index = index + n-1
        if index != len(sentence):
            finalq.append(sentence[len(sentence)-index:len(sentence)])
    return finalq

# Search the web for the plagiarised text
# Calculate the cosineSimilarity of the given query vs matched content on google
# This is returned as 2 dictionaries
def searchWeb(text,output,c):
    try:
        text = text.encode('utf-8')
    except:
        text = text
    query = urllib.parse.quote_plus(text)
    if len(query) > 60:
        return output,c
    #using googleapis for searching web
    base_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
    url = base_url + '%22' + query + '%22'
    request = urllib.request.Request(url,None,{'Referer':'Google Chrome'})
    response = urllib.request.urlopen(request)
    results = simplejson.load(response)
    try:
        if (len(results) and 'responseData' in results
                and 'results' in results['responseData']
                and results['responseData']['results'] != []):
            for ele in results['responseData']['results']:
                Match = results['responseData']['results'][0]
                content = Match['content']
                if Match['url'] in output:
                    #print text
                    #print strip_tags(content)
                    output[Match['url']] = output[Match['url']] + 1
                    c[Match['url']] = (c[Match['url']]*(output[Match['url']] - 1) + cosineSim(text,strip_tags(content)))/(output[Match['url']])
                else:
                    output[Match['url']] = 1
                    c[Match['url']] = cosineSim(text,strip_tags(content))
    except:
        return output,c
    return output,c

# Use the main function to scrutinize a file for
# plagiarism
def main():
    # n-grams N VALUE SET HERE
    n = 9
    if len(sys.argv) < 3:
        print("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    if sys.argv[1].endswith(".docx"):
        t = docxExtract(sys.argv[1])
    else:
        t = open(sys.argv[1],'r')
    if not t:
        print("Invalid Filename")
        print("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    t = t.read()
    queries = getQueries(t,n)
    q = [' '.join(d) for d in queries]
    found = []
    #using 2 dictionaries: c and output
    #output is used to store the url as key and number of occurences of that url in different searches as value
    #c is used to store url as key and sum of all the cosine similarities of all matches as value
    output = {}
    c = {}
    i = 1
    count = len(q)
    if count > 100:
        count = 100
    for s in q[:100]:
        output,c = searchWeb(s,output,c)
        msg = "\r"+str(i)+"/"+str(count)+"completed..."
        sys.stdout.write(msg)
        sys.stdout.flush()
        i = i+1
    #print ("\n")
    f = open(sys.argv[2],"w")
    for ele in sorted(iter(c.items()),key=operator.itemgetter(1),reverse=True):
        f.write(str(ele[0])+" "+str(ele[1]*100.00))
        f.write("\n")
    f.close()
    print("\nDone!")

if __name__ == "__main__":
    try:
        main()
    except:
        #writing the error to stdout for better error detection
        error = traceback.format_exc()
        print(("\nUh Oh!\n"+"Plagiarism-Checker encountered an error!:\n"+error))
```
docx's last release was in 2014. The code imports the module exceptions, which was a top-level module in Python 2.7 but was removed in Python 3:

```
$ python2.7 -c "import exceptions"
$ python3.7 -c "import exceptions"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'exceptions'
```

The bottom line: the package is only for Python 2. Use Python 2.7 or find a different package.
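If you need to stay on Python 3, one commonly suggested replacement is the separately maintained python-docx package (pip install python-docx). A minimal sketch of extracting a document's text with it, assuming a local file named example.docx:

```
# Requires: pip install python-docx  (a different project from the 2014-era docx)
from docx import Document

doc = Document('example.docx')  # 'example.docx' is a placeholder filename
for para in doc.paragraphs:
    print(para.text)
```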
python crawler ieee paper keywords
I am trying to use a crawler to get IEEE paper keywords, but now I get an error. How can I fix my crawler? My code is here:

```
import requests
import json
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')
for i in tag[9]:
    s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace("'", '"').replace(";", ''))
```

and the error is here:

```
Traceback (most recent call last):
  File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 90, in <module>
    a.get_es_data(offset=0, size=1)
  File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 53, in get_es_data
    self.get_data(link=ieee_link, esid=es_id)
  File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 65, in get_data
    s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace(";", '').replace("'", '"'))
IndexError: list index out of range
```
Here's another answer. I don't know what you are doing with 's' in your code after the load/replace. The code below doesn't throw an error, but again: how are you using 's'?

```
import requests
import json
import re
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')

# i is a list
for i in tag[9]:
    metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
    metadata = re.findall(metadata_format, i)
    if len(metadata) != 0:
        # convert the list
        convert_to_json = json.dumps(metadata)
        x = json.loads(convert_to_json)
        s = x[0].replace("'", '"').replace(";", '')
        ###########################################
        # I don't know what you plan to do with 's'
        ###########################################
        print(s)
```
Apparently in line 65 some of the data provided in i did not suit the regex pattern you're trying to use. Therefore your [0] will not work, as re.findall returned an empty list. Solution: check the result before indexing it:

```
x = re.findall('global.document.metadata=(.*;)', i)
if x:
    s = json.loads(x[0].replace("'", '"').replace(";", ''))
```
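Building on that, a hedged alternative sketch: rather than hardcoding tag[9], scan every script block for the metadata assignment and only parse when a match is found. The 'keywords' field at the end is a hypothetical name, not a confirmed part of IEEE's metadata schema:

```
import re
import json
import requests
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'html.parser')

metadata = None
for script in soup.find_all('script'):
    text = script.string or ''
    # Non-greedy match up to the first '};' -- may need widening if the
    # embedded object nests '};' sequences inside strings.
    match = re.search(r'global\.document\.metadata\s*=\s*(\{.*?\});', text, re.DOTALL)
    if match:
        # Assumes the embedded JavaScript object is valid JSON; if it uses
        # single quotes it would need the same replace() hacks as above.
        metadata = json.loads(match.group(1))
        break

if metadata is not None:
    print(metadata.get('keywords'))  # 'keywords' is a hypothetical field name
```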
YouTube API search list_next() throws UnicodeEncodeError
When I feed a non-English string into the YouTube API library's search, it only works during the initial search. If I call list_next(), it throws a UnicodeEncodeError. When I use a simple ASCII string, everything works correctly. Any suggestions about what I should do? Here's a simplified version of what I'm doing:

```
# -*- coding: utf-8 -*-
import apiclient.discovery

def test(query):
    youtube = apiclient.discovery.build('youtube', 'v3', developerKey='xxx')
    ys = youtube.search()
    req = ys.list(
        q=query.encode('utf-8'),
        type='video',
        part='id,snippet',
        maxResults=50
    )
    while (req):
        res = req.execute()
        for i in res['items']:
            print(i['id']['videoId'])
        req = ys.list_next(req, res)

test(u'한글')
test(u'日本語')
test(u'\uD55C\uAE00')
test(u'\u65E5\u672C\u8A9E')
```

Error message:

```
Traceback (most recent call last):
  File "E:\prj\scripts\yt\search.py", line 316, in _search
    req = ys.list_next(req, res)
  File "D:\Apps\Python\lib\site-packages\googleapiclient\discovery.py", line 966, in methodNext
    parsed[4] = urlencode(newq)
  File "D:\Apps\Python\lib\urllib.py", line 1343, in urlencode
    v = quote_plus(str(v))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-9: ordinal not in range(128)
```

Versions: google-api-python-client (1.6.2), Python 2.7.13 (Win32)

EDIT: I posted a workaround below.
If anyone else is interested, here's one workaround that works for me, in googleapiclient/discovery.py:

```
# old
q = parse_qsl(parsed[4])
# new
q = parse_qsl(parsed[4].encode('ascii'))
```

Explanation: in discovery.py, list_next() parses and unescapes the previous URL, then makes a new URL from it:

```
pageToken = previous_response['nextPageToken']
parsed = list(urlparse(request.uri))
q = parse_qsl(parsed[4])

# Find and remove old 'pageToken' value from URI
newq = [(key, value) for (key, value) in q if key != 'pageToken']
newq.append(('pageToken', pageToken))
parsed[4] = urlencode(newq)
uri = urlunparse(parsed)
```

It seems the problem is that when parse_qsl unescapes the unicode parsed[4], it returns the UTF-8 encoded value in a unicode type, which urlencode does not like:

```
>>> q = urlparse.parse_qsl(u'q=%ED%95%9C%EA%B8%80')
[(u'q', u'\xed\x95\x9c\xea\xb8\x80')]
>>> urllib.urlencode(q)
UnicodeEncodeError
```

If parse_qsl is given a plain ASCII string, it returns a plain UTF-8 encoded string, which urlencode likes:

```
>>> q = urlparse.parse_qsl(u'q=%ED%95%9C%EA%B8%80'.encode('ascii'))
[('q', '\xed\x95\x9c\xea\xb8\x80')]
>>> urllib.urlencode(q)
'q=%ED%95%9C%EA%B8%80'
```
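If patching the installed library is not an option, a workaround sketch that pages manually with pageToken instead of calling list_next() should also avoid the failing re-encode, because every request is built fresh from the original parameters. This is untested against this exact client version, so treat it as an assumption:

```
# Hedged sketch: page manually via pageToken instead of list_next(), so the
# client never re-parses and re-encodes the previous request's URL.
def search_all(ys, query):
    params = dict(q=query.encode('utf-8'), type='video',
                  part='id,snippet', maxResults=50)
    while True:
        res = ys.list(**params).execute()
        for item in res['items']:
            print(item['id']['videoId'])
        token = res.get('nextPageToken')
        if not token:
            break
        params['pageToken'] = token
```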
How do I read this stringified javascript variable into Python?
I'm trying to read _pageData from https://www.simpliowebstudio.com/wp-content/uploads/2014/07/aWfyh1 into Python 2.7.11 so that I can process it, using this code:

```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Testing _pageData processing. """

import urllib2
import re
import ast
import json
import yaml

BASE_URL = 'https://www.simpliowebstudio.com/wp-content/uploads/2014/07/aWfyh1'

def main():
    """ Do the business. """
    response = urllib2.urlopen(BASE_URL, None)
    results = re.findall('var _pageData = \\"(.*?)\\";</script>', response.read())
    first_result = results[0]
    # These all fail
    data = ast.literal_eval(first_result)
    # data = yaml.load(first_result)
    # data = json.loads(first_result)

if __name__ == '__main__':
    main()
```

but get the following error:

```
Traceback (most recent call last):
  File "./temp.py", line 24, in <module>
    main()
  File "./temp.py", line 19, in main
    data = ast.literal_eval(first_result)
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ast.py", line 49, in literal_eval
    node_or_string = parse(node_or_string, mode='eval')
  File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ast.py", line 37, in parse
    return compile(source, filename, mode, PyCF_ONLY_AST)
  File "<unknown>", line 1
    [[1,true,true,true,true,true,true,true,true,,\"at\",\"\",\"\",1450364255674,\"\",\"en_US\",false,[]\n,\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/embed?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/edit?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/thumbnail?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",,,true,\"https://www.google.com/maps/d/print?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/pdf?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",false,false,\"/maps/d\",\"maps/sharing\",\"//www.google.com/intl/en_US/help/terms_maps.html\",true,\"https://docs.google.com/picker\",[]\n,false,true,[[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-001.png\",143,25]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-2x-001.png\",286,50]\n]\n,[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-001.png\",113,20]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-2x-001.png\",226,40]\n]\n]\n,1,\"https://www.gstatic.com/mapspro/_/js/k\\u003dmapspro.gmeviewer.en_US.8b9lQX3ifcs.O/m\\u003dgmeviewer_base/rt\\u003dj/d\\u003d0/rs\\u003dABjfnFWonctWGGtD63MaO3UZxCxF6UPKJQ\",true,true,false,true,\"US\",false,true,true,5,false]\n,[\"mf.map\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",\"Hollywood, FL\",\"\",[-80.16005,26.01043,-80.16005,26.01043]\n,[-80.16005,26.01043,-80.16005,26.01043]\n,[[,\"zBghbRiSwHlg.kq4rrF9BNRIg\",\"Untitled layer\",\"\",[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026scale\\u003d1.0\"]\n,[]\n,1,1,[[,[26.01043,-80.16005]\n]\n,\"MDZBMzJCQjRBOTAwMDAwMQ~CjISKmdlby1tYXBzcHJvLm1hcHNob3AtbGF5ZXItNDUyOWUwMTc0YzhkNmI2ZBgAKAAwABIZACBawIJBU4Fe8v7vNSoAg0dtnhhVotEBLg\",\"vdb:\",\"zBghbRiSwHlg.kq4rrF9BNRIg\",[26.01043,-80.16005]\n,[0,-32]\n,\"06A32BB4A9000001\"]\n,[[\"Hollywood, FL\"]\n]\n,[]\n]\n]\n,,1.0,true,true,,,,[[\"zBghbRiSwHlg.kq4rrF9BNRIg\",1,,,,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\\u0026lid\\u003dzBghbRiSwHlg.kq4rrF9BNRIg\",,,,,0,2,true,[[[\"06A32BB4A9000001\",[[[26.01043,-80.16005]\n]\n]\n,[]\n,[]\n,0,[[\"name\",[\"Hollywood, FL\"]\n,1]\n,,[]\n,[]\n]\n,,0]\n]\n,[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026filter\\u003dff\\u0026scale\\u003d1.0\",[16,32]\n,1.0]\n,[[\"0000FF\",0.45098039215686275]\n,5000]\n,[[\"0000FF\",0.45098039215686275]\n,[\"000000\",0.25098039215686274]\n,3000]\n]\n]\n]\n]\n]\n,[]\n,,,,,1]\n]\n,[2]\n,,,\"mapspro\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",,true,false,false,\"\",2,false,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",3807]\n]\n
    ^
SyntaxError: invalid syntax
```

var _pageData is in this format:

```
"[[1,true,true,true,true,true,true,true,true,,\"at\",\"\",\"\",1450364255674,\"\",\"en_US\",false,[]\n,\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/embed?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/edit?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/thumbnail?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",,,true,\"https://www.google.com/maps/d/print?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/pdf?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",false,false,\"/maps/d\",\"maps/sharing\",\"//www.google.com/intl/en_US/help/terms_maps.html\",true,\"https://docs.google.com/picker\",[]\n,false,true,[[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-001.png\",143,25]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-2x-001.png\",286,50]\n]\n,[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-001.png\",113,20]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-2x-001.png\",226,40]\n]\n]\n,1,\"https://www.gstatic.com/mapspro/_/js/k\\u003dmapspro.gmeviewer.en_US.8b9lQX3ifcs.O/m\\u003dgmeviewer_base/rt\\u003dj/d\\u003d0/rs\\u003dABjfnFWonctWGGtD63MaO3UZxCxF6UPKJQ\",true,true,false,true,\"US\",false,true,true,5,false]\n,[\"mf.map\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",\"Hollywood, FL\",\"\",[-80.16005,26.01043,-80.16005,26.01043]\n,[-80.16005,26.01043,-80.16005,26.01043]\n,[[,\"zBghbRiSwHlg.kq4rrF9BNRIg\",\"Untitled layer\",\"\",[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026scale\\u003d1.0\"]\n,[]\n,1,1,[[,[26.01043,-80.16005]\n]\n,\"MDZBMzJCQjRBOTAwMDAwMQ~CjISKmdlby1tYXBzcHJvLm1hcHNob3AtbGF5ZXItNDUyOWUwMTc0YzhkNmI2ZBgAKAAwABIZACBawIJBU4Fe8v7vNSoAg0dtnhhVotEBLg\",\"vdb:\",\"zBghbRiSwHlg.kq4rrF9BNRIg\",[26.01043,-80.16005]\n,[0,-32]\n,\"06A32BB4A9000001\"]\n,[[\"Hollywood, FL\"]\n]\n,[]\n]\n]\n,,1.0,true,true,,,,[[\"zBghbRiSwHlg.kq4rrF9BNRIg\",1,,,,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\\u0026lid\\u003dzBghbRiSwHlg.kq4rrF9BNRIg\",,,,,0,2,true,[[[\"06A32BB4A9000001\",[[[26.01043,-80.16005]\n]\n]\n,[]\n,[]\n,0,[[\"name\",[\"Hollywood, FL\"]\n,1]\n,,[]\n,[]\n]\n,,0]\n]\n,[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026filter\\u003dff\\u0026scale\\u003d1.0\",[16,32]\n,1.0]\n,[[\"0000FF\",0.45098039215686275]\n,5000]\n,[[\"0000FF\",0.45098039215686275]\n,[\"000000\",0.25098039215686274]\n,3000]\n]\n]\n]\n]\n]\n,[]\n,,,,,1]\n]\n,[2]\n,,,\"mapspro\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",,true,false,false,\"\",2,false,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",3807]\n]\n"
```

I've tried replacing the \" and \n and decoding the \uXXXX before using, without success. I've also tried replacing ,, with ,"", and ,'', without success. Thank you.
It seems like there are three kinds of syntactic errors in your string:

- , followed by ,
- [ followed by ,
- , followed by ]

Assuming that those are supposed to be null elements (or ''?), you can just replace those in the original string, exactly like you did for the ,, case, but you missed the others. Also, you have to do the ,, replacement twice, otherwise you will miss cases such as ,,,,. Then, you can load the JSON string with json.loads.

```
>>> s = "your messed up json string"
>>> s = re.sub(r",\s*,", ", null,", s)
>>> s = re.sub(r",\s*,", ", null,", s)
>>> s = re.sub(r"\[\s*,", "[ null,", s)
>>> s = re.sub(r",\s*\]", ", null]", s)
>>> json.loads(s)
```
I started off using ast.literal_eval(...) because I was under the (mistaken?) impression that JavaScript arrays and Python lists were mutually compatible, so all I had to do was destringify _pageData. However, I hadn't noticed that Python doesn't like ,, true, false or [,. Fixing them does the trick (thank you #Two-Bit Alchemist and #tobias_k). So, the following appears to work:

```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Testing _pageData processing. """

import urllib2
import re
import ast
import json
import yaml

BASE_URL = 'https://www.simpliowebstudio.com/wp-content/uploads/2014/07/aWfyh1'

def main():
    """ Do the business. """
    response = urllib2.urlopen(BASE_URL, None)
    results = re.findall('var _pageData = \\"(.*?)\\";</script>', response.read())
    first_result = results[0]
    first_result = first_result.replace(',,,,,,', ',None,None,None,None,None,')
    first_result = first_result.replace(',,,,,', ',None,None,None,None,')
    first_result = first_result.replace(',,,,', ',None,None,None,')
    first_result = first_result.replace(',,,', ',None,None,')
    first_result = first_result.replace(',,', ',None,')
    first_result = first_result.replace('[,', '[None,')
    first_result = first_result.replace('\\"', '\'')
    first_result = first_result.replace('\\n', '')
    first_result = first_result.replace('true', 'True')
    first_result = first_result.replace('false', 'False')
    data = ast.literal_eval(first_result)
    for entry in data:
        print entry

if __name__ == '__main__':
    main()
```
local variable 'texts' referenced before assignment
This is code to extract Unicode values from text files, but it gives me the following error.

```
# -*- coding: utf-8 -*-
import codecs
import os
#from urllib import urlopen
from bs4 import BeautifulSoup
import re
##import nltk

#def remove_content_li(input_document) :
#soup = BeautifulSoup(input_document)

def extract_unicode(input):
    _ascii_letters = re.compile(r'[a-zA-Z]', flags=re.UNICODE)
    symbols = re.compile(r'[{} &+( )" =!.?.:.. / | » © : >< # « ,] 1 2 3 4 5 6 7 8 9 _ - + ; [ ] %', flags=re.UNICODE)
    soup = BeautifulSoup(open(input,'r'), 'lxml')
    for li in soup.find_all('li'):
        li.decompose()
        texts = soup.findAll(text=True)

    def contains_unicode(text):
        try:
            str(text)
        except:
            return True
        return False

    result = ' '.join((text for text in texts if contains_unicode(texts)))
    result = _ascii_letters.sub(" ", result)
    result = symbols.sub(" ", result)
    ##print(result)
    ## result = nltk.clean_html(result)
    result.replace('*', '')
```

This is the error I get:

```
  File "e3.py", line 50, in <module>
    extract_unicode((os.path.join(dirname, filename)))
  File "e3.py", line 30, in extract_unicode
    result = ' '.join((text for text in texts if contains_unicode(texts)))
UnboundLocalError: local variable 'texts' referenced before assignment
```
The error is telling you exactly what the problem is. You're using a variable texts before you define it. Perhaps soup.find_all('li') is returning an empty list, since you only set texts if it finds something.
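For completeness, a minimal sketch of the fix implied above: assign texts unconditionally, after the loop, so it exists even when find_all('li') returns an empty list.

```
# Move the assignment out of the loop so `texts` is always defined,
# even when the document contains no <li> elements.
for li in soup.find_all('li'):
    li.decompose()
texts = soup.findAll(text=True)  # now assigned on every code path
```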