I'm following this tutorial from the website: https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed
Everything is good so far, but I keep getting an error when trying to run this code.
def buildTrainingSet(corpusFile, tweetDataFile):
    import csv
    import time

    corpus = []

    with open(corpusFile, 'rb') as csvfile:
        lineReader = csv.reader(csvfile, delimiter=',', quotechar="\"")
        for row in lineReader:
            corpus.append({"tweet_id": row[2], "label": row[1], "topic": row[0]})

    rate_limit = 180
    sleep_time = 900 / 180

    trainingDataSet = []

    for tweet in corpus:
        try:
            status = twitter_api.GetStatus(tweet["tweet_id"])
            print("Tweet fetched" + status.text)
            tweet["text"] = status.text
            trainingDataSet.append(tweet)
            time.sleep(sleep_time)
        except:
            continue

    # now we write them to the empty CSV file
    with open(tweetDataFile, 'wb') as csvfile:
        linewriter = csv.writer(csvfile, delimiter=',', quotechar="\"")
        for tweet in trainingDataSet:
            try:
                linewriter.writerow([tweet["tweet_id"], tweet["text"], tweet["label"], tweet["topic"]])
            except Exception as e:
                print(e)

    return trainingDataSet
#================
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv"
tweetDataFile = "C:\Users\Vilma\Documents\CIS450\group prjt/tweetDataFile.csv"
trainingData = buildTrainingSet (corpusFile, tweetDataFile)
I keep getting this error:
File "<ipython-input-33-54fea359e8f9>", line 1
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv"
^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
I even tried putting r' in front of C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv, but I still keep getting the error.
Update: I fixed that error by changing the code to
corpusFile = r'C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv'
tweetDataFile = r'C:\Users\Vilma\Documents\CIS450\group prjt\tweetDataFile.csv'
However, a new error pops up:
File "<ipython-input-41-f44768dabc6e>", line 7, in buildTrainingSet
with open(corpusFile,'rb') as csvfile:
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Vilma\\Documents\\CIS450\\group prjt\\corpus.csv'
Try correcting your file path.
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt/corpus.csv"
Should be:
corpusFile = "C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv"
Hope this helps!
You can use:
corpusFile = r"C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv"
If the file still isn't found, make sure it actually exists in that folder.
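A minimal sketch of equivalent ways to write that Windows path in Python (folder and file names are just the ones from the question; adjust them to your actual location):

from pathlib import Path

# Option 1: raw string, so backslashes are not treated as escape sequences
corpusFile = r"C:\Users\Vilma\Documents\CIS450\group prjt\corpus.csv"

# Option 2: forward slashes, which Windows file APIs also accept
corpusFile = "C:/Users/Vilma/Documents/CIS450/group prjt/corpus.csv"

# Option 3: pathlib, which joins the path with the right separators
corpusFile = Path("C:/Users/Vilma/Documents/CIS450/group prjt") / "corpus.csv"

# Quick existence check before passing the path to buildTrainingSet
print(corpusFile, Path(corpusFile).exists())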
Related
I'm trying to open a file using the csv module, but I received this error:
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x83 in position 7458: character maps to <undefined>
I checked the file, and the file encoding is UTF-8.
Below is my code. The error is on line 63.
import csv
import xml.etree.ElementTree as ET
import xml.dom.minidom as PT
import traceback

# Global variables
# Variable to hold file name
FILE_NAME = "CustomLabels.labels"
# Variable to hold delimiter
DELIMETE = ','
# Variable to hold root category in xml hierarchy
CUSTOM_LABELS = "CustomLabels"
# Variable to hold sub element in xml
LABELS = "labels"
# Variable to hold argument for CustomLabels category
XMLNS = 'xmlns'
# Variable to hold value for argument for CustomLabels category
URL = "http://soap.sforce.com/2006/04/metadata"
# Variable to hold the save path
SAVE_PATH = ""
# Variable to hold file to read name
FILE_TO_READ = "CustomLabels.csv"

# Function to open the file with ugly XML
def openFile():
    print('D:M|***| openFile')
    try:
        customLabelsFile = open(FILE_NAME, 'r+', encoding="utf-8")
    except Exception:
        traceback.print_exc()
    return customLabelsFile

# Function to make pretty XML on output
def prettyXMLfile():
    print('D:M|***| prettyXMLfile')
    try:
        dom = PT.parse(FILE_NAME)
        pretty_xml_as_string = dom.toprettyxml()
    except Exception:
        traceback.print_exc()
    return pretty_xml_as_string

# Function to save pretty XML
# Parameters:
# xml_file - the file returned by openFile
# context - the formatted xml
def saveAsPrertyXML(xml_file, context):
    try:
        n = xml_file.write(context)
        xml_file.close()
    except Exception:
        traceback.print_exc()

with open(FILE_TO_READ, "rb+", encoding="utf-8", errors='ignore') as csv_file:
    csv_reader = csv.reader(csv_file, encoding='utf-8', delimiter=DELIMETE)
    line_count = 0
    listOfColumnNames = list()
    customLabels = ET.Element(CUSTOM_LABELS)
    customLabels.set(XMLNS, URL)
    try:
        for row in csv_reader:
            if line_count == 0:
                listOfColumnNames.append(row)
                finalListOfColumns = listOfColumnNames[line_count]
                line_count += 1
            else:
                index = 0
                while index < len(finalListOfColumns):
                    if index == 0:
                        labels = ET.SubElement(customLabels, LABELS)
                    ET.SubElement(labels, finalListOfColumns[index]).text = row[index]
                    index += 1
                line_count += 1
    except Exception:
        print(f'The line with error is {line_count}')
        traceback.print_exc()

tree = ET.ElementTree(customLabels)
tree.write(FILE_NAME, xml_declaration=True, encoding='utf-8', method="xml")
uglyXML = openFile()
prettyXMLasString = prettyXMLfile()
saveAsPrertyXML(uglyXML, prettyXMLasString)

print(f'Generator parsed {line_count} lines')
print('XML file saved successfully')
OK, I figured out what was wrong.
It should be:
with open(FILE_TO_READ,"rt",encoding="utf-8") as csv_file:
instead of
with open(FILE_TO_READ,"rb+",encoding="utf-8") as csv_file:
I'm using a standard try/except syntax for skipping rows in a csv file that aren't streaming properly and therefore can't be downloaded. My code:
for row in list_reader:
    media_id = row['mediaId']
    filename = row['mediaId']
    saveFile = media.get_item(media_id)
    stream_url = saveFile['streams'][0]['streamLocation']
    try:
        r = requests.get(stream_url, allow_redirects=True)
        with open(os.path.join('./media', filename), 'wb') as ofile:
            ofile.write(r.content)
        counter += 1
    except:
        IndexError
        print "error"
However, after downloading a number of files, the problem row comes up, the error is not handled, and I get this error:
Traceback (most recent call last):
File "downloadmedia.py", line 28, in <module>
stream_url = saveFile['streams'][0]['streamLocation']
IndexError: list index out of range
I've tried an if/else syntax instead, using the length of the stream_url variable, but this gives the same error. Can someone explain why the error handling doesn't work?
As stated in the comments, your try/except is in the wrong place. From the traceback you provided, you can see that the IndexError occurs at the line stream_url = saveFile['streams'][0]['streamLocation']
You need to make sure the try/except is covering this line to prevent this.
for row in list_reader:
    try:
        media_id = row['mediaId']
        filename = row['mediaId']
        saveFile = media.get_item(media_id)
        stream_url = saveFile['streams'][0]['streamLocation']
        r = requests.get(stream_url, allow_redirects=True)
        with open(os.path.join('./media', filename), 'wb') as ofile:
            ofile.write(r.content)
        counter += 1
    except IndexError:
        print "error"
I'm creating a new column and want to save the result to a new file. But in the Excel file, one column contains a bad character. How can I skip that line during the save process, or change it to a correct character?
import pandas as pd

path = '/My Documents/Python/'
fileName = "test.xlsx"

# open the excel file
ef = pd.ExcelFile(path + fileName)

# read the contents
df = pd.read_excel(path + fileName, sheet_name=ef.sheet_names[0])

print(df['Content'])
print(df['Engine'])

i = 1
for test in df['Content']:
    try:
        print(i)
        print(test)
    except:
        print("An exception occurred")
        break
    i += 1

df['Test'] = 'value'
df.to_excel('My Documents/Python/Test_NEW.xlsx')
Error message
data, consumed = self.encode(object, self.errors)
UnicodeEncodeError: 'utf-8' codec can't encode character '\ude7c' in position 470: surrogates not allowed
df['Content'] = df['Content'].astype(str)
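If converting the column to str is not enough on its own, a minimal sketch of stripping the unencodable surrogate characters before saving (paths and column names are just the ones from the question; the encode/ignore/decode round trip is one way to drop characters like '\ude7c'):

import pandas as pd

path = '/My Documents/Python/'
fileName = "test.xlsx"

ef = pd.ExcelFile(path + fileName)
df = pd.read_excel(path + fileName, sheet_name=ef.sheet_names[0])

# Drop characters that cannot be encoded as UTF-8 (such as lone surrogates)
df['Content'] = (
    df['Content']
    .astype(str)
    .str.encode('utf-8', errors='ignore')
    .str.decode('utf-8')
)

df['Test'] = 'value'
df.to_excel('My Documents/Python/Test_NEW.xlsx')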
When I use CountVectorizer in sklearn, it needs the file encoding to be Unicode, but my data file is encoded in ANSI.
I tried to change the encoding to Unicode using Notepad++ and then use readlines, but it could not read all the lines; it only read the last line. After that, I tried to read the lines from the data files and write them to a new file as Unicode, but I failed.
def merge_file():
    root_dir = "d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
    resname = 'resule_final.txt'
    if os.path.exists(resname):
        os.remove(resname)
    result = codecs.open(resname, 'w', 'utf-8')
    num = 1
    for back_name in os.listdir(r'd:\\workspace\\minibatchk-means\\data\\20_newsgroups'):
        current_dir = root_dir + str(back_name)
        for filename in os.listdir(current_dir):
            print num, ":", str(filename)
            num = num + 1
            path = current_dir + "\\" + str(filename)
            source = open(path, 'r')
            line = source.readline()
            line = line.strip('\n')
            line = line.strip('\r')
            while line != "":
                line = unicode(line, "gbk")
                line = line.replace('\n', ' ')
                line = line.replace('\r', ' ')
                result.write(line + ' ')
                line = source.readline()
            else:
                print 'End file :' + str(filename)
                result.write('\n')
            source.close()
    print 'End All.'
    result.close()
The error message is :UnicodeDecodeError: 'gbk' codec can't decode bytes in position 0-1: illegal multibyte sequence
Oh, I found the way.
First, use chardet to detect the string encoding.
Second, use codecs to read from or write to the file in that specific encoding.
Here is the code.
import chardet
import codecs
import os

root_dir = "d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
num = 1
failed = []
for back_name in os.listdir("d:\\workspace\\minibatchk-means\\data\\20_newsgroups"):
    current_dir = root_dir + str(back_name)
    for filename in os.listdir(current_dir):
        print num, ":", str(filename)
        num = num + 1
        path = current_dir + "\\" + str(filename)
        content = open(path, 'r').read()
        source_encoding = chardet.detect(content)['encoding']
        if source_encoding == None:
            print '??', filename
            failed.append(filename)
        elif source_encoding != 'utf-8':
            content = content.decode(source_encoding, 'ignore')
            codecs.open(path, 'w', encoding='utf-8').write(content)
print failed
Thanks for all your help.
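For reference, a minimal sketch of the same detect-then-rewrite idea in Python 3, where open() takes the encoding directly (paths reused from the question; chardet's guess is only a heuristic, so odd files may still need manual handling):

import os
import chardet

root_dir = r"d:\workspace\minibatchk-means\data\20_newsgroups"

for back_name in os.listdir(root_dir):
    current_dir = os.path.join(root_dir, back_name)
    for filename in os.listdir(current_dir):
        path = os.path.join(current_dir, filename)
        # Read raw bytes and let chardet guess the source encoding
        raw = open(path, 'rb').read()
        source_encoding = chardet.detect(raw)['encoding']
        if source_encoding and source_encoding.lower() != 'utf-8':
            text = raw.decode(source_encoding, errors='ignore')
            # Rewrite the file in place as UTF-8
            with open(path, 'w', encoding='utf-8') as out:
                out.write(text)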
I am currently extracting comments from various subreddits on Reddit using praw and trying to calculate their sentiment and add them to a database.
It works by reading from a file that contains subreddit names in order to know which subreddit to pull the comments from.
with open('subs.txt') as f:
    for line in f:
        string = line.strip()
        for submission in reddit.subreddit(string).hot(limit=10):
            subreddit = reddit.subreddit(line.strip())
            name = str(subreddit.display_name)
            comments = submission.comments.list()
            for c in comments:
                if isinstance(c, MoreComments):
                    continue
                #print c.body
                author = c.author
                score = c.score
                created_at = c.created_utc
                upvotes = c.ups
                #print c.score
                comment_sentiment = getSentiment(c.body)
                subreddit_sentiment += comment_sentiment
                num_comments += 1
What I have currently implemented works fine up until it reaches a certain comment where it throws the following error message:
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 8-10: unexpected end of data
I have looked at a bunch of different questions on here where people have encountered the same issue, but the solutions given did not seem to help my problem.
The full stack trace is as follows:
Traceback (most recent call last):
File "extract.py", line 48, in <module>
comment_sentiment = getSentiment(c.body)
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 93, in getSentiment
tagged_sentences = makeTag(pos_tag_text, max_key_size, dictionary)
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 106, in makeTag
return [addTag(sentence, max_key_size, dictionary) for sentence in postagged_sentences]
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 119, in addTag
expression_word = ' '.join([word[0] for word in sentence[i:j]]).lower().encode('utf-8',errors='ignore')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 8-10: unexpected end of data
I've been racking my brain trying to think of various ways to solve this issue, and unfortunately I'm lost. Is it something to do with reading from the file containing the subreddits, or could it be related to the limit on pulling data with praw? I have tried to isolate the problem but can't seem to shake this error.
Would anyone be able to help me fix this issue? I would appreciate any insight.
Many thanks.
EDIT:
sentiment_analysis.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import pandas as pd
import nltk
import yaml
import sys
import os
import re

# splitting the text initially
def splitString(text):
    nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
    sentences = nltk_splitter.tokenize(text)
    tokenized_sentences = [nltk_tokenizer.tokenize(sentence) for sentence in sentences]
    return tokenized_sentences

def tagWords(sentence, max_key_size, dictionary, tag_stem=False):
    # Tag all possible sentences
    tagged_sentence = []
    length = len(sentence)
    if max_key_size == 0:
        max_key_size = length
    i = 0
    while (i < length):
        j = min(i + max_key_size, length)
        tagged = False
        while (j > i):
            expression_word = ' '.join([word[0] for word in sentence[i:j]]).lower().encode('utf-8', errors='ignore')  # here is where it gets caught
            expression_stem = ' '.join([word[1] for word in sentence[i:j]]).lower().encode('utf-8', errors='ignore')
            if tag_stem == True:
                word = expression_word
            else:
                word = expression_word
            ....
Try encoding the string explicitly:
c.body.encode('utf-8')
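Applied inside the loop from the question, that would look roughly like this (a sketch only; whether it fully resolves the error depends on what getSentiment does with the value internally):

comment_sentiment = getSentiment(c.body.encode('utf-8'))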