I want to tokenize reviews and sort them by keywords, but there is a problem opening the JSON file: the program throws an error: JSONDecodeError: Extra data: line 1 column 884 (char 883).
The files test2.json and keywords.txt are here:
https://github.com/SilverYar/TransportDataMiner
Here is my code:
import nltk
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords
import string, json
st = RussianStemmer()
def tokenize_me(file_text):
    # applying nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # deleting punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]
    # deleting stop words
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if (i not in stop_words)]
    # cleaning words
    tokens = [i.replace('«', '').replace('»', '') for i in tokens]
    return tokens
with open('C:\\Creme\\token\\keywords.txt') as fin:
    ww = fin.read().split(', ')
    key_words = list(set([st.stem(w) for w in ww]))

with open('C:\\Creme\\token\\test2.json') as fin:
    text = json.load(fin)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
Help me find the error and fix the code.
You can't do this, because it produces a malformed JSON file:
for dd in text:
    if tt:
        json.dump(dd, fout)  # <<-- cannot do this in the loop
        fout.write('\n')
Basically, it should be written all at once, with a single dump() or dumps() call.
OK, so you have to build the whole list first, then write it out to the file:
bad_words_list = []
for dd in text:
    words = tokenize_me(dd['description'])
    split_text = list(set([st.stem(word) for word in words]))
    tt = list(filter(lambda w: w in key_words, split_text))
    if tt:
        bad_words_list.append(dd)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    json.dump(bad_words_list, fout)
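With everything collected into one list and dumped as a single JSON array, reading the result back is then one json.load call; for example (a small sketch using the same path as above):
with open('C:\\Creme\\token\\bad.json', encoding='utf8') as fin:
    bad_words_list = json.load(fin)  # the file now parses as one JSON document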
I solved it a little differently: I read the file in and reformatted the string into valid JSON:
with open('C:\\Creme\\token\\test2.json', 'r', encoding='utf8') as fin:
    data = fin.read()
    formated_text = data.replace('}{', '},{')
    text = json.loads(f'[{formated_text}]')

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
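For reference, the original "Extra data" error means test2.json holds several JSON objects written back to back. Instead of patching the text with replace('}{', '},{'), another option is to parse the objects one at a time with json.JSONDecoder.raw_decode; a minimal sketch, assuming the same concatenated-objects layout:
import json

def iter_concatenated_json(path):
    # yield each top-level JSON object from a file of back-to-back objects
    decoder = json.JSONDecoder()
    with open(path, encoding='utf8') as fin:
        data = fin.read()
    pos = 0
    length = len(data)
    while pos < length:
        # skip any whitespace before / between objects
        while pos < length and data[pos].isspace():
            pos += 1
        if pos >= length:
            break
        obj, pos = decoder.raw_decode(data, pos)
        yield obj

text = list(iter_concatenated_json('C:\\Creme\\token\\test2.json'))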
I am working on a text file called "dracula.txt", and I have to do the following in Python:
Save words that occur no more than 3 times in descending order in a file called less_common_words.txt. Each word with its count should be saved on a separate line.
I would appreciate any help! I've been working on this for too long.
I have already tokenized my file and counted the words. This is my code so far:
file = open("C:/Users/17733/Downloads/dracula.txt", 'r', encoding = 'utf-8-sig')
data = file.read()
data
data_list = data.split('\n')
data_list
new_list = []
for i in data_list:
if i !='':
ans_here = i.split(' ')
new_list.extend(ans_here)
new_list
import string
import re
puncs = list(string.punctuation)
puncs.append('"')
puncs.append('[')
puncs.append('.')
puncs.append('-')
puncs.append('_')
#append each seperately
new_2 = []
for i in new_list:
for p in puncs:
if p in i:
i_new = i.replace(p, ' ')
new_2.append(i_new)
new_2
new_2 = [i.replace(' ', ' ').strip().lower() for i in new_2]
new_2
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
output_filepath = Path('outfile.txt')
# print(filepath.exists())

with open(filepath) as f:
    content = f.readlines()

word_list = sum((
    (s.lower().strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])

less_common_words = sorted([
    key for key, value in Counter(word_list).items() if value <= 3
], reverse=True)

with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(less_common_words))
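If you also need each word's count on its line (as the question asks), a small variation on the above, sorting by count in descending order, could look like this; the "word count" line format is just an assumption, and it reuses word_list and output_filepath from the snippet above:
counts = Counter(word_list)
less_common = sorted(
    ((word, count) for word, count in counts.items() if count <= 3),
    key=lambda pair: pair[1],
    reverse=True,  # highest count among the rare words first
)
with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(f'{word} {count}' for word, count in less_common))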
This should be exactly what you need; I fixed my previous error by flattening the 2D list built from the text into a single flat list:
book_open = open('frankenstein.txt', 'r').readlines()
beauty_book = [i.split() for i in book_open]

flatten = []
for sublist in beauty_book:
    for val in sublist:
        flatten.append(val)

foo = 0
for i in flatten:
    list_open = open('less_common_words.txt', 'r').readlines()
    beauty_list = [i.replace('\n', '') for i in list_open]
    count = flatten.count(flatten[foo])
    compile = str((flatten[foo], count))
    if count <= 3:
        if compile not in beauty_list:
            file = open('less_common_words.txt', 'a+')
            file.write('\n' + compile)
            file.close()
    foo += 1
I'm trying to process data (remove hashtags, links, and @ mentions) from CSV files and store it back to CSV. However, the output does not come out well: each character is separated by a comma. Can anyone help me write it to CSV in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['#', '@']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])

temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))

for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)

f.close()
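The per-character commas come from writer.writerows(temp1): each string in temp1 is treated as a row, and iterating a string yields its individual characters. A minimal fix (a sketch, keeping your variable names) is to open the output file once and wrap each cleaned string in a one-element list:
with open('MYOUTPUT.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    for t in temp1:
        writer.writerow([t])  # one-column row, not one cell per character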
I have two text files. The 1st file contains English sentences and the 2nd file contains a list of English words (a vocabulary). I want to remove from the sentences in the 1st file those words that are not present in the vocabulary, and then save the processed text back into the 1st file.
I wrote code that finds the sentences containing words that are not in the 2nd file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt", "a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = f.readlines()

with open('eng.txt', 'r+') as f:
    eng = f.readlines()

vocab_sentences = [i.split(" ") for i in vocabulary]
eng = [i.split(" ") for i in eng]

cleaned_sentences = []
# loop over the sentences and exclude words in eng
for sent in vocab_sentences:
    cleaned_sentences.append(" ".join([i for i in sent if i not in eng]))

# write the file
with open('vocab30000.txt', 'w') as f:
    f.writelines(cleaned_sentences)
You can try this code. I tried not to use any loops, to save runtime in case you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("\\b(" + "|".join(punctuation) + ")\\W", re.I)
s_copy = pattern.sub(" ", s_copy)
s_copy = s_copy.replace("\"", "")
s_words = s_copy.split(" ")

with open('vocab30000.txt', 'r') as f:
    check_words = f.read().split()  # list of vocabulary words

remove_words = list(set(s_words) - set(check_words))
pattern = re.compile("\\b(" + "|".join(remove_words[1:]) + ")\\W", re.I)
s = pattern.sub("", s)
I am new to Python, so please excuse me if I am asking a very simple question.
I am trying to read each line from a text file, predict the sentiment of each line, and write the output back to the text file. For that I am trying to append the result to the end of each line.
My text file looks like this:
I am awesome.
I am terrible.
I am bad.
What I am trying to achieve is this:
I am awesome. - Positive
I am terrible. - Negative
I am bad. - Negative
When I run the code, the file ends up empty. Please help.
My code is below:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names

def word_feats(words):
    return dict([(word, True) for word in words])

positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']

positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]

train_set = negative_features + positive_features
classifier = NaiveBayesClassifier.train(train_set)

# Predict
neg = 0
pos = 0
f = open("test.txt", "r")
for sentence in f.readlines():
    sentence = sentence.lower()
    words = sentence.split(' ')
    for word in words:
        classResult = classifier.classify(word_feats(word))
        if classResult == 'neg':
            f.write(' negative')
        if classResult == 'pos':
            f.write(' positive')
f.close()
You can't write to a file that is open in 'r' mode - that mode is for reading.
My suggestion is to open the file for reading, and open a second file and write out to that. So something like:
f = open("test.txt", "r")
out_file = open("output.txt", "w")
for sentence in f.readlines():
orig = sentence
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
classResult = classifier.classify(word_feats(word))
if classResult == 'neg':
out_file.write(orig + ' negative')
if classResult == 'pos':
out_file.write(orig + ' positive')
f.close()
out_file.close()
You are opening the file in read mode. You would need to open the file in write mode:
f = open('test.txt', 'w')
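Note that 'w' truncates the file, so reading and writing test.txt in the same pass won't work. A minimal sketch of one way to append a label to each line of the same file, assuming the classifier and word_feats defined above, and classifying the whole sentence rather than one word at a time:
with open("test.txt", "r") as f:
    lines = f.read().splitlines()

labelled = []
for sentence in lines:
    feats = word_feats(sentence.lower().split())
    label = classifier.classify(feats)  # 'pos' or 'neg'
    suffix = ' - Positive' if label == 'pos' else ' - Negative'
    labelled.append(sentence + suffix)

with open("test.txt", "w") as f:  # rewrite the file with labels appended
    f.write('\n'.join(labelled) + '\n')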
I am trying to clean up some text files in Python. I want to take out stop words, digits, and the newline character, but I keep getting a "coercing to Unicode" error. Here is my code:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from string import digits
import glob

def cleanupDoc(s):
    s = s.translate(None, digits)
    s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup

flist = glob.glob('/home/uiucinfo/Desktop/*txt')
mylist = []
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    doc = open(fdoc)
    newDoc = cleanupDoc(doc)
    doc.close()
My Error
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
TypeError: coercing to Unicode: need string or buffer, list found
tfile.readlines() gives you a list of lines, which you are appending to another list:
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist.append(line)
As a result, you have a list of lists in mylist.
The following should fix the problem:
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist += line
This will give you a list of strings in mylist.
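For example (illustrative values, not from the files above):
mylist = []
mylist.append(['line 1\n', 'line 2\n'])  # -> [['line 1\n', 'line 2\n']]  (a list of lists)

mylist = []
mylist += ['line 1\n', 'line 2\n']       # -> ['line 1\n', 'line 2\n']    (a flat list of strings)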
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
#nltk.download()
import string
from string import digits
import glob
import re

def cleanupDoc(s):
    #s = s.translate(None,digits)
    #s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup

flist = glob.glob('/home/uiucinfo/Desktop/*txt')
mylist = []
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    # remove \n or digits from fdoc
    fdoc = [re.sub(r'[\"\n]|\d', '', x) for x in fdoc]
    # convert the list to a single string
    fdoc = ''.join(fdoc)
    print fdoc
    newDoc = cleanupDoc(fdoc)
    print " newDoc: ", newDoc