Error in with loop in Python

I have some Python code that preprocesses text and writes it to a file.
It removes hashtags, usernames, symbols, links and stopwords, and also reduces each word to its root.
import tweepy
import time
import os
import sys
import json
import argparse
import re
from collections import defaultdict
import glob
from nltk.stem.snowball import SnowballStemmer
text = "shit.txt"
def process_text(text=text):
    text = re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
    text = re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
    text = re.sub('\\B$[a-zA-Z0-9_]*\\b','',text)
    text = re.sub('\\bRT\\b','',text)
    text = text.lower()
    text = re.sub("(https?://[^ ]+)",'',text)
    if text:
        a1 = [line.split("-")[0] for line in file("ListOfShortWords.txt")]
        a2 = [re.sub("\n",'',line.split("-")[1]).encode("utf-8") for line in file("ListOfShortWords.txt")]
        HashList = defaultdict(lambda: "nil")
        for c in range(0,len(a1)):
            HashList[a1[c]] = a2[c]
        text = re.sub(r'([aeiou])\1{2,}', r'\1', text)
        text = re.sub(r'([^aeiou])\1{2,}', r'\1\1', text)
        text = re.sub(r'(.)\1{2,}\\b', r'\1', text)
        for key in HashList.keys():
            text = re.sub("\\b"+str(key)+"\\b", str(HashList[key]), text)
        for stopword in ['about','above','after','ain\'t','aint','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','could','did','do','does','doing','down','during','each','few','for','from','further','had','has','have','having','he','he\'d','he\'ll','he\'s','here','here\'s','hers','herself','him','himself','her','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','ive','if','in','into','is','it','it\'s','its','itself','let\'s','lets','me','more','most','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours','ourselves','out','over','own','same','she','she\'d','she\'ll','she\'s','shes','should','so','some','such','than','that','that\'s','thats','the','their','theirs','them','themselves','then','there','there\'s','theres','these','they','they\'d','theyd','they\'ll','they\'re','they\'ve','theyll','theyre','theyve','this','those','through','to','too','under','until','up','very','was','we','we\'d','we\'ll','we\'re','we\'ve','were','what','what\'s','whats','when','when\'s','whens','where','where\'s','wheres','which','while','who','who\'s','whos','whom','why','why\'s','whys','with','won\'t','wont','would','you','you\'d','youd','you\'ll','youll','you\'re','you\'ve','youre','youve','your','yours','yourself','yourselves','\'tis','\'twas','tis','twas']:
            text = re.sub("\\b"+stopword+"\\b",'',text)
        for ch in ['&','$',',','.','/',':',';','"','{','[','}',']','|','\\','+','=','-','_',')','(','*','^','%','!','~','`','?']:
            text = text.replace(ch,' ')
        text = re.sub("\\b[0-9]*\\b",'',text)
        text = text.replace('\'','')
        text = re.sub('\\b[a-z]\\b','',text)
        text = re.sub(r'[^\x00-\x7F]+',' ',text)
        text = ' '.join(text.split())
    return text
for pp in ['pos','neg','neu','irr']:
    a = 1
    for fil in glob.glob("Senti/"+str(pp)+"/*.txt"):
        for line in file(fil):
            t = process_text(text=line)
            realline = ''
            for word in t.split():
                realline = realline+" "+str(SnowballStemmer("english").stem(word)
            with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
                outf.write(realline)
                a = a+1
I get an error saying
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
^
SyntaxError: invalid syntax
What is wrong with the code? All required folders and files exist.

There is a ) missing on the previous line: the call to str() is not closed correctly.
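With the parenthesis added, the loop would look like this (creating the stemmer once before the loop is an optional tidy-up, not part of the original code):
stemmer = SnowballStemmer("english")
realline = ''
for word in t.split():
    # the closing ) of the str(...) call is now in place
    realline = realline + " " + str(stemmer.stem(word))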


All text is saved in one line

So, I was trying to use NLTK in Python to do part-of-speech tagging on a text file.
This is the code I used:
import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
saveFile.write(str(paosted))
saveFile.close()
The code did work, but the problem is that it saved all the text on one single line, as shown in the attached picture. I know I should be adding "\n" somewhere, but I am a novice in Python and have no idea how to do it, so any help would be appreciated.
-------- UPDATE -----------
People have been really helpful and offered some solutions, e.g. this code:
import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
saveFile.write(str(paosted).replace('),' , '),\n'))
saveFile.close()
But I still need to have it in the form of a paragraph, because I am going to use it later in concordance software. Please have a look at this screenshot:
https://i.stack.imgur.com/tU1NW.png
paosted is a list of tuples; you can iterate over it and write each tuple on its own line.
Ex:
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
for line in paosted:
    saveFile.write(str(line) + "\n")
saveFile.close()
Updating my answer accordingly:
temp = []
for i in paosted:
    temp.append("_".join(i))
" ".join(temp)
Thank you all! I followed some of your instructions and the best result I got was with this code:
import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('output.txt', 'w')
saveFile.write(str(paosted).replace("('.', '.')" , "\n"))
saveFile.close()

Error with " blob = TextBlob(tweet[text])" in sentiment analysis using Python and Textblob

I'm working on a project in which I extract tweets from Twitter and run sentiment analysis on specific keywords to draw conclusions. Unfortunately, I have come to a point where I am stumped. I have the following sentiment analysis code.
When I use blob = TextBlob(tweet[text]) I get the following error:
Traceback (most recent call last):
  File "C:/Users/Michael/python/Sentiment2.py", line 65, in <module>
    blob = TextBlob(tweet[text])
NameError: name 'text' is not defined
import json
import re
import operator
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os, sys, codecs
import csv
import sys
from nltk import bigrams
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:#[\w_]+)', # #-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
fname = 'python.json'
with open(fname, 'r') as f:
    lis = []
    neg = 0.0
    n = 0.0
    net = 0.0
    pos = 0.0
    p = 0.0
    count_all = Counter()
    cout = 0
    for line in f:
        try:
            tweet = json.loads(line)
        except:
            continue
        # Create a list with all the terms
        blob = TextBlob(tweet[text])
        cout += 1
        lis.append(blob.sentiment.polarity)
        #print blob.sentiment.subjectivity
        #print (os.listdir(tweet["text"]))
        if blob.sentiment.polarity < 0:
            sentiment = "negative"
            neg += blob.sentiment.polarity
            n += 1
        elif blob.sentiment.polarity == 0:
            sentiment = "neutral"
            net += 1
        else:
            sentiment = "positive"
            pos += blob.sentiment.polarity
            p += 1

# output sentiment
print("Total tweets"), len(lis)
print("Positive"), float(p/cout)*100, "%"
print("Negative"), float(n/cout)*100, "%"
print("Neutral"), float(net/len(lis))*100, "%"
#print lis
# determine if sentiment is positive, negative, or neutral
# output sentiment
#print sentiment
Change this
# Create a list with all the terms
blob = TextBlob(tweet[text])
to
# Create a list with all the terms
blob = TextBlob(tweet['text'])
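As a side note, if some lines in python.json are not ordinary tweets (streaming APIs can emit objects with no "text" field), a slightly more defensive sketch of that spot is:
# skip any JSON object that has no "text" field
tweet_text = tweet.get('text')
if not tweet_text:
    continue
blob = TextBlob(tweet_text)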

Import Error: No Module named 'xxxx' Specific

Basically, what I'm trying to do is make a program in Python which takes a URL, copies the source, pulls all the comments out and presents them to the user.
import urllib2
import html2text
import PullsCommentsOut.pullscommentsout
url = raw_input('Please input URL with the text you want to analyze: ')
page = urllib2.urlopen(url)
html_content = page.read().decode('utf8')
rendered_content = html2text.html2text(html_content).encode('ascii', 'ignore')
f = open('file_text.txt', 'wb')
f.write(rendered_content)
f.close()
result = PullsCommentsOut.pullscommentsout(html_content)
print result
And my second file, 'PullsCommentsOut'
import re
def pullscommentsout():
    def comment_remover(text):
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                print s
                return " " # note: a space and not an empty string
            else:
                return s
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        return re.sub(pattern, replacer, text)
    fd = open("test.c", "r")
    buf = fd.read()
    comment_remover(buf)
For the life of me, I can't figure out why Python thinks I'm not importing the proper module. It doesn't make sense.
I need to add more text so it allows me to post, so, how are you all doing? I'm doing pretty good I guess. No complaints.

How to find unique words for each text file in a bundle of text files using python?

How can I find only the words that are unique to a text file? If a word is used frequently in other files, then it gets dropped.
Here is a reference: http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html
I need a script which loops through all the text files in a folder and outputs the results in JSON format.
My code so far:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os
def get_raw_data():
    texts = []
    for x in range(1,95):
        file_name = str(x+1)+".txt"
        with codecs.open(file_name,"rU","utf-8") as myfile:
            data = myfile.read()
        texts.append(data)
        yield file_name, '\n'.join(texts)
class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words
def process_text(counts, vectorizer, text, file_name, index):
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    result = {w: c for w, c in result.iteritems() if c > 4}
    normalizing_factor = max(c for c in result.itervalues())
    result = {w: c / normalizing_factor
              for w, c in result.iteritems()}
    return result
def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n, tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print (counts)
    for x in range(95):
        file_name = str(x+1)+".txt"
        # print (text)
        for i, (text) in enumerate(data):
            print (file_name)
            # print (text)
            with codecs.open(file_name,"rU","utf-8") as myfile:
                text = myfile.read()
            result = process_text(counts, vectorizer, text, file_name, i)
            print (result)

if __name__ == '__main__':
    main()
Looks like you've got a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find words that occur in one file only. I'd just gather all words, counting how many files each one occurs in; and print out the singletons.
from collections import Counter
import re
fileids = [ str(n+1)+".txt" for n in range(95) ]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp: # Add encoding if really needed
        text = fp.read().lower()
    words = re.split(r"\W+", text) # Keep letters, drop the rest
    filecounts.update(set(words))

singletons = [ word for word in filecounts if filecounts[word] == 1 ]
print(" ".join(singletons))
Done. You don't need scikit, you don't need the nltk, you don't need a pile of IR algorithms. You can use the list of singletons in an IR algorithm, but that's a different story.
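The question also asks for the results per file in JSON format; a small extension of the same counting idea (a sketch, assuming the 1.txt ... 95.txt naming from above and an illustrative output file unique_words.json) could be:
import json

# second pass: keep, for each file, only the words that occur in exactly one file
unique_per_file = {}
for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower()))
    unique_per_file[fname] = sorted(w for w in words if filecounts[w] == 1)

with open("unique_words.json", "w") as out:
    json.dump(unique_per_file, out, indent=2)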
def parseText():
    # oFile: text file to test
    # myWord: word we are looking for
    # Get all lines into list
    aLines = oFile.readlines()
    # Perform list comprehension on lines to test if the word is found
    for sLine in aLines:
        # Parse the line (remove spaces), returns list
        aLine = sLine.split()
        # Iterate words and test to see if they match our word
        for sWord in aLine:
            # if it matches, append it to our list
            if sWord == myWord: aWords.append( sWord )

# Create empty list to store all instances of the word that we may find
aWords = []
# Prompt user to know what word to search
myWord = str( raw_input( 'what word to search:' ) )
# Call function
parseText()
# Check if list has at least one element
if len( aWords ) < 1: print 'Word not found in file'
else: print str( len( aWords ) ) + ' instances of our word found in file'

Cleaning text files in Python 2: TypeError: coercing to Unicode

I am trying to clean up some text files in Python 2. I want to take out the stopwords, digits and the newline character, but I keep getting a 'coercing to Unicode' TypeError. Here is my code:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from string import digits
import glob
def cleanupDoc(s):
    s = s.translate(None,digits)
    s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup
flist=glob.glob('/home/uiucinfo/Desktop/*txt')
mylist=[]
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    doc = open(fdoc)
    newDoc = cleanupDoc(doc)
    doc.close()
My Error
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
TypeError: coercing to Unicode: need string or buffer, list found
tfile.readlines() gives you a list of lines, which you are appending to another list:
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist.append(line)
As a result, you have a list of lists in mylist.
The following should fix the problem:
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist += line
This will give you a list of strings in mylist.
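With that change, every element of mylist is a plain string, so each line can be passed straight to the cleanupDoc function from the question (a minimal sketch):
cleaned = []
for line in mylist:
    # each line is now a str, which is what cleanupDoc expects
    cleaned.append(cleanupDoc(line))
print cleaned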
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
#nltk.download()
import string
from string import digits
import glob
import re
def cleanupDoc(s):
    #s = s.translate(None,digits)
    #s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup
flist=glob.glob('/home/uiucinfo/Desktop/*txt')
mylist=[]
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    # remove \n or digit from fdoc
    fdoc = [re.sub(r'[\"\n]|\d', '', x) for x in fdoc]
    # convert list to string
    fdoc = ''.join(fdoc)
    print fdoc
    newDoc = cleanupDoc(fdoc)
    print " newDoc: ", newDoc
