Error in with loop in Python - python
I have code in Python to preprocess some text and write it to a file.
It removes hashtags, usernames, symbols and links, drops stopwords, and also reduces each word to its root.
import tweepy
import time
import os
import sys
import json
import argparse
import re
from collections import defaultdict
import glob
from nltk.stem.snowball import SnowballStemmer
text = "shit.txt"
def process_text(text=text):
    text = re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
    text = re.sub('\\B#[a-zA-Z0-9_]*\\b','',text)
    text = re.sub('\\B$[a-zA-Z0-9_]*\\b','',text)
    text = re.sub('\\bRT\\b','',text)
    text = text.lower()
    text = re.sub("(https?://[^ ]+)",'',text)
    if text:
        a1 = [line.split("-")[0] for line in file("ListOfShortWords.txt")]
        a2 = [re.sub("\n",'',line.split("-")[1]).encode("utf-8") for line in file("ListOfShortWords.txt")]
        HashList = defaultdict(lambda:"nil")
        for c in range(0,len(a1)):
            HashList[a1[c]] = a2[c]
        text = re.sub(r'([aeiou])\1{2,}', r'\1', text)
        text = re.sub(r'([^aeiou])\1{2,}', r'\1\1', text)
        text = re.sub(r'(.)\1{2,}\\b', r'\1', text)
        for key in HashList.keys():
            text = re.sub("\\b"+str(key)+"\\b",str(HashList[key]),text)
        for stopword in ['about','above','after','ain\'t','aint','again','against','all','am','an','and','any','are','as','at','be','because','been','before','being','below','between','both','but','by','could','did','do','does','doing','down','during','each','few','for','from','further','had','has','have','having','he','he\'d','he\'ll','he\'s''here''here\'s''hers''herself''him''himself','her','his','how','how\'s','i','i\'d','i\'ll','i\'m','i\'ve','ive','if','in','into','is','it','it\'s','its','itself','let\'s','lets','me','more','most','my','myself','no','nor','not','of','off','on','once','only','or','other','ought','our','ours','ourselves','out','over','own','same','she','she\'d','she\'ll','she\'s','shes','should','so','some','such','than','that','that\'s','thats','the','their','theirs','them','themselves','then','there','there\'s','theres','these','they','they\'d','theyd','they\'ll','they\'re','they\'ve','theyll','theyre','theyve','this','those','through','to','too','under','until','up','very','was','we','we\'d','we\'ll','we\'re','we\'ve','were','what','what\'s','whats','when','when\'s','whens','where','where\'s','wheres','which','while','who','who\'s','whos','whom','why','why\'s','whys','with','won\'t','wont','would','you','you\'d','youd','you\'ll','youll','you\'re','you\'ve','youre','youve','your','yours','yourself','yourselves','\'tis','\'twas','tis','twas']:
            text = re.sub("\\b"+stopword+"\\b",'',text)
        for ch in ['&','$',',','.','/',':',';','"','{','[','}',']','|','\\','+','=','-','_',')','(','*','^','%','!','~','`','?']:
            text = text.replace(ch,' ')
        text = re.sub("\\b[0-9]*\\b",'',text)
        text = text.replace('\'','')
        text = re.sub('\\b[a-z]\\b','',text)
        text = re.sub(r'[^\x00-\x7F]+',' ',text)
        text = ' '.join(text.split())
    return text
for pp in ['pos','neg','neu','irr']:
    a = 1
    for fil in glob.glob("Senti/"+str(pp)+"/*.txt"):
        for line in file(fil):
            t = process_text(text=line)
            realline = ''
            for word in t.split():
                realline = realline+" "+str(SnowballStemmer("english").stem(word)
            with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
                outf.write(realline)
            a = a+1
I get an error saying
with open("Processed Senti/"+str(pp)+"/"+str(a)+".txt", 'w') as outf:
^
SyntaxError: invalid syntax
What is wrong with the code? All required folders and files exist.
There is a ) missing at the end of the previous line: the call to str() is never closed, so Python only reports the invalid syntax when it reaches the with statement on the next line.
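A minimal sketch of the corrected inner loop, keeping the variable names from the question and creating the stemmer once instead of once per word:

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")  # build the stemmer once, not once per word
realline = ''
for word in t.split():
    # the original line was missing the ')' that closes the str(...) call
    realline = realline + " " + str(stemmer.stem(word))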
Related
All text is saved in one line
So, I was trying to use NLTK from Python to do part-of-speech tagging on a text file. This is the code I used:

import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
saveFile.write(str(paosted))
saveFile.close()

The code did work, but the problem is that it saved all the text on one single line, as shown in the attached picture. I know I should be using "\n", but I am a novice in Python and have no idea how to do it, so any help would be appreciated :)

-------- UPDATE -----------

Well, people have been really helpful and offered some solutions, i.e., this code:

import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile.write(str(paosted).replace('),' , '),\n'))
saveFile.close()

But I still need to have it in the form of a paragraph, because I am going to use it later in concordance software. Please have a look at this screenshot: https://i.stack.imgur.com/tU1NW.png
paosted is a list of tuples; you can iterate over it and write each tuple on its own line. Ex:

paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
for line in paosted:
    saveFile.write(str(line) + "\n")
saveFile.close()
Updating my answer accordingly:

temp = []
for i in paosted:
    temp.append("_".join(i))
" ".join(temp)
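A fuller sketch of that approach, keeping the output as a single paragraph of word_TAG tokens (the all.txt and output.txt file names are the ones already used in this thread), might look like:

import nltk
from nltk import word_tokenize

with open('all.txt') as f:
    raw = f.read()

tagged = nltk.pos_tag(word_tokenize(raw))  # list of (word, tag) tuples

# join each pair as word_TAG, separated by spaces, so the result stays one paragraph
paragraph = " ".join("_".join(pair) for pair in tagged)

with open('output.txt', 'w') as saveFile:
    saveFile.write(paragraph)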
Thank you all! I followed some of your instructions and the best result I got was with this code:

import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('output.txt', 'w')
saveFile.write(str(paosted).replace("('.', '.')" , "\n"))
saveFile.close()
Error with " blob = TextBlob(tweet[text])" in sentiment analysis using Python and Textblob
I'm working on a project in which I extract tweets from Twitter and run a sentiment analysis on specific keywords to draw conclusions. Unfortunately, I have come to a point where I am stumped. I have a sentiment analysis code, and when I use this:

blob = TextBlob(tweet[text])

I get the following error:

Traceback (most recent call last):
  File "C:/Users/Michael/python/Sentiment2.py", line 65, in <module>
    blob = TextBlob(tweet[text])
NameError: name 'text' is not defined

Here is the code:

import json
import re
import operator
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os, sys, codecs
import csv
import sys
from nltk import bigrams

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:#[\w_]+)', # #-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']

fname = 'python.json'
with open(fname, 'r') as f:
    lis = []
    neg = 0.0
    n = 0.0
    net = 0.0
    pos = 0.0
    p = 0.0
    count_all = Counter()
    cout = 0
    for line in f:
        try:
            tweet = json.loads(line)
        except:
            continue
        # Create a list with all the terms
        blob = TextBlob(tweet[text])
        cout += 1
        lis.append(blob.sentiment.polarity)
        #print blob.sentiment.subjectivity
        #print (os.listdir(tweet["text"]))
        if blob.sentiment.polarity < 0:
            sentiment = "negative"
            neg += blob.sentiment.polarity
            n += 1
        elif blob.sentiment.polarity == 0:
            sentiment = "neutral"
            net += 1
        else:
            sentiment = "positive"
            pos += blob.sentiment.polarity
            p += 1
        # output sentiment
    print("Total tweets"), len(lis)
    print("Positive"), float(p/cout)*100, "%"
    print("Negative"), float(n/cout)*100, "%"
    print("Neutral"), float(net/len(lis))*100, "%"
    #print lis
    # determine if sentiment is positive, negative, or neutral
    # output sentiment
    #print sentiment
Change this:

# Create a list with all the terms
blob = TextBlob(tweet[text])

to:

# Create a list with all the terms
blob = TextBlob(tweet['text'])
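As a side note (not part of the original answer), streamed tweet objects do not always contain a 'text' key, so a slightly more defensive version of that loop, sketched here with the names from the question, skips such lines:

with open(fname, 'r') as f:
    for line in f:
        try:
            tweet = json.loads(line)
        except ValueError:
            continue
        text_value = tweet.get('text')  # None when the object has no 'text' field (e.g. delete notices)
        if text_value is None:
            continue
        blob = TextBlob(text_value)
        # ...sentiment handling continues exactly as in the question...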
Import Error: No Module named 'xxxx' Specific
Basically what I'm trying to do is make a program in Python which takes a URL, copies the source, pulls all comments out and presents them to the user.

import urllib2
import html2text
import PullsCommentsOut.pullscommentsout

url = raw_input('Please input URL with the text you want to analyze: ')
page = urllib2.urlopen(url)
html_content = page.read().decode('utf8')
rendered_content = html2text.html2text(html_content).encode('ascii', 'ignore')
f = open('file_text.txt', 'wb')
f.write(rendered_content)
f.close()
result = PullsCommentsOut.pullscommentsout(html_content)
print result

And my second file, 'PullsCommentsOut':

import re

def pullscommentsout():
    def comment_remover(text):
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                print s
                return " "  # note: a space and not an empty string
            else:
                return s
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        return re.sub(pattern, replacer, text)

    fd = open("test.c", "r")
    buf = fd.read()
    comment_remover(buf)

For the life of me I can't figure out why Python thinks I'm not importing the proper module. It doesn't make sense. I need to add more text so it allows me to post, so, how are you all doing? I'm doing pretty good I guess. No complaints.
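For what it's worth, import PullsCommentsOut.pullscommentsout only works if PullsCommentsOut is a package and pullscommentsout is a module inside it. If PullsCommentsOut.py is a single module containing the function (an assumption, since the project layout isn't shown), a sketch of the import and call would be:

# main script -- sketch, assuming PullsCommentsOut.py sits in the same directory
import PullsCommentsOut

result = PullsCommentsOut.pullscommentsout(html_content)
print result

Note also that pullscommentsout(), as posted, takes no argument and returns nothing, so it would additionally need to accept the text and return a result for print result to show anything.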
How to find unique words for each text file in a bundle of text files using python?
How can I find only words that are unique to a text file? If a word is used frequently in other files, then it gets dropped. Here is a reference: http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html

I need a script which loops through all text files in a folder and outputs the results in JSON format. My code so far:

from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os

def get_raw_data():
    texts = []
    for x in range(1, 95):
        file_name = str(x+1) + ".txt"
        with codecs.open(file_name, "rU", "utf-8") as myfile:
            data = myfile.read()
        texts.append(data)
        yield file_name, '\n'.join(texts)

class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

def process_text(counts, vectorizer, text, file_name, index):
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    result = {w: c for w, c in result.iteritems() if c > 4}
    normalizing_factor = max(c for c in result.itervalues())
    result = {w: c / normalizing_factor for w, c in result.iteritems()}
    return result

def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n, tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print(counts)
    for x in range(95):
        file_name = str(x+1) + ".txt"
        # print (text)
        for i, (text) in enumerate(data):
            print(file_name)
            # print (text)
            with codecs.open(file_name, "rU", "utf-8") as myfile:
                text = myfile.read()
            result = process_text(counts, vectorizer, text, file_name, i)
            print(result)

if __name__ == '__main__':
    main()
Looks like you've got a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find words that occur in one file only. I'd just gather all words, counting how many files each one occurs in, and print out the singletons.

from collections import Counter
import re

fileids = [str(n+1) + ".txt" for n in range(95)]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:  # Add encoding if really needed
        text = fp.read().lower()
    words = re.split(r"\W+", text)  # Keep letters, drop the rest
    filecounts.update(set(words))

singletons = [word for word in filecounts if filecounts[word] == 1]
print(" ".join(singletons))

Done. You don't need scikit, you don't need the nltk, you don't need a pile of IR algorithms. You can use the list of singletons in an IR algorithm, but that's a different story.
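Since the question also asks for per-file output in JSON format, here is a sketch extending the same Counter idea; it keeps the 1.txt ... 95.txt naming assumption and writes to a hypothetical unique_words.json:

from collections import Counter
import json
import re

fileids = [str(n + 1) + ".txt" for n in range(95)]
filecounts = Counter()
words_per_file = {}

for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower()))
    words.discard("")              # re.split can leave empty strings at the edges
    words_per_file[fname] = words
    filecounts.update(words)

# keep only the words that occur in exactly one file
unique_per_file = {
    fname: sorted(w for w in words if filecounts[w] == 1)
    for fname, words in words_per_file.items()
}

with open("unique_words.json", "w") as out:
    json.dump(unique_per_file, out, indent=2)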
def parseText():
    # oFile: text file to test
    # myWord: word we are looking for

    # Get all lines into list
    aLines = oFile.readlines()

    # Perform list comprehension on lines to test if the word is found
    for sLine in aLines:
        # Parse the line (remove spaces), returns list
        aLine = sLine.split()
        # Iterate words and test to see if they match our word
        for sWord in aLine:
            # if it matches, append it to our list
            if sWord == myWord:
                aWords.append(sWord)

# Create empty list to store all instances of the word that we may find
aWords = []

# Prompt user to know what word to search
myWord = str(raw_input('what word to search:'))

# Call function
parseText()

# Check if list has at least one element
if len(aWords) < 1:
    print 'Word not found in file'
else:
    print str(len(aWords)) + ' instances of our word found in file'
Cleaning text files in Python 2: TypeError: coercing to Unicode
I am trying to clean up some text files in Python. I want to take out stopwords, digits and the newline character, but I keep getting a "coercing to Unicode" error. Here is my code:

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from string import digits

def cleanupDoc(s):
    s = s.translate(None, digits)
    s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup

flist = glob.glob('/home/uiucinfo/Desktop/*txt')
mylist = []
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    doc = open(fdoc)
    newDoc = cleanupDoc(doc)
    doc.close()

My error:

Traceback (most recent call last):
  File "<stdin>", line 3, in <module>
TypeError: coercing to Unicode: need string or buffer, list found
tfile.readlines() gives you a list of lines, which you are appending to another list:

for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist.append(line)

As a result, you have a list of lists in mylist. The following should fix the problem:

for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist += line

This will give you a list of strings in mylist.
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
#nltk.download()
import string
from string import digits
import glob
import re

def cleanupDoc(s):
    #s = s.translate(None,digits)
    #s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup

flist = glob.glob('/home/uiucinfo/Desktop/*txt')
mylist = []
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    # remove \n or digit from fdoc
    fdoc = [re.sub(r'[\"\n]|\d', '', x) for x in fdoc]
    # convert list to string
    fdoc = ''.join(fdoc)
    print fdoc
    newDoc = cleanupDoc(fdoc)
    print " newDoc: ", newDoc