I'm trying to process data (remove hashtags, links, and @ mentions) from CSV files and store it back to CSV. However, the output does not come out well: every character is separated by a comma. Can anyone help me write it to CSV in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])
temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))
for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)
f.close()
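For what it's worth, here is a minimal sketch of the writing step that avoids the one-comma-per-character output. The issue is that csv.writer treats a bare string as a sequence of characters, so each cleaned string needs to be wrapped in a list, and the output file should be opened once rather than inside the loop (variable names here are illustrative):

# a sketch of the writing step, assuming each cleaned line should become a single-column row
with open('MYOUTPUT.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    for cleaned in temp1:
        writer.writerow([cleaned])  # wrapping the string makes it one cell, not one cell per character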
I have multiple text files and I need to find and count specific words in those files and write the results to a CSV file. Column A should contain the txt file names, the header should contain the words, and each row should hold the count of each word for that file. With this code I am getting counts for all the words, and I need to filter out only the exact words I want.
For example, the output should look like the image file I uploaded.
import csv
import os
import re
from collections import Counter
from glob import glob

header = ['Abuse', 'Accommodating', 'Accommodation', 'Accountability']
folderpaths = 'C:/Users/haris/Downloads/PDF/'
counter = Counter()
filepaths = glob(os.path.join(folderpaths, '*.txt'))
for file in filepaths:
    with open(file) as f:
        words = re.findall(r'\w+', f.read().lower())
        counter = counter + Counter(words)
print(counter)

f = open('C:/Users/haris/Downloads/PDF/firstcsv.csv', 'w')
writer = csv.writer(f)
for row in counter.items():
    writer.writerow(row)
Files uploaded to google drive
Edit: As per your new request, I have added the "total_words" column. The code has been updated.
Below is code that works. Just change the "folderpath" variable to the path of the folder with the text files, and change the "target_file" variable to where you want the output csv file to be created.
Sample csv output:
Code:
from collections import Counter
import glob
import os
import re

header = ['annual', 'investment', 'statement', 'range', 'deposit', 'supercalifragilisticexpialidocious']
folderpath = r'C:\Users\USERname4\Desktop\myfolder'
target_file = r'C:\Users\USERname4\Desktop\mycsv.csv'

queueWAP = []

def writeAndPrint(fileObject, toBeWAP, opCode=0):
    # opCode 0: write and print immediately
    # opCode 1: queue the value for later
    # opCode 2: flush everything queued so far, then clear the queue
    global queueWAP
    if (opCode == 0):
        fileObject.write(toBeWAP)
        print(toBeWAP)
    if (opCode == 1):
        queueWAP.append(toBeWAP)
    if (opCode == 2):
        for temp4 in range(len(queueWAP)):
            fileObject.write(queueWAP[temp4])
            print(queueWAP[temp4])
        queueWAP = []

mycsvfile = open(target_file, 'w')

# header row: file_name, total_words, then one column per target word
writeAndPrint(mycsvfile, "file_name,total_words")
for temp1 in header:
    writeAndPrint(mycsvfile, "," + temp1)
writeAndPrint(mycsvfile, "\n")

filepaths = glob.glob(folderpath + r"\*.txt")
for file in filepaths:
    with open(file) as f:
        writeAndPrint(mycsvfile, file.split("\\")[-1])
        counter = Counter()
        words = re.findall(r'\w+', f.read().lower())
        counter = counter + Counter(words)
        for temp2 in header:
            temp3 = False
            temp5 = 0
            for myword in counter.items():
                temp5 = temp5 + 1
                if myword[0] == temp2:
                    writeAndPrint(mycsvfile, "," + str(myword[1]), 1)  # queue this word's count
                    temp3 = True
            if temp3 == False:
                writeAndPrint(mycsvfile, "," + "0", 1)  # queue a zero if the word never appears
        writeAndPrint(mycsvfile, "," + str(temp5))  # total_words column (number of distinct words in this file)
        writeAndPrint(mycsvfile, "", 2)  # flush the queued per-word counts
        writeAndPrint(mycsvfile, "\n")
mycsvfile.close()
Using 'Counter' seems to be the right choice here, but I think you are using it wrong.
Here is a possible solution that may work for you:
import csv
from collections import Counter

# filepaths comes from the question's glob(...) call
words = ['Abuse', 'Accommodating', 'Accommodation', 'Accountability']

rows = []
for file in filepaths:
    with open(file, 'r') as f:
        words_in_file = [word for line in f for word in line.split()]
        # this will count all the words in the file (not optimal)
        wordcounts = Counter(words_in_file)
        # interested only in specific words
        counts = list(map(lambda x: wordcounts[x], words))
        # insert first column (filename)
        counts.insert(0, file)
        # append it to the rest of the rows
        rows.append(counts)

f = open('C:/Users/haris/Downloads/PDF/firstcsv.csv', 'w')
writer = csv.writer(f)
for row in rows:
    writer.writerow(row)
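As a small follow-up: the desired output in the question also has a header row, which this snippet does not write. A hedged addition (assuming the same words list and writer, and a first column called "file_name") would be to emit it before the data rows:

writer.writerow(['file_name'] + words)  # write the header row before the data rows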
I am working on a text file right now that is called "dracula.txt", and I have to do the following in python:
Save words that occur no more than 3 times in descending order in a file called less_common_words.txt. Each word with its count should be saved on a separate line.
I would appreciate any help! I've been working on this for too long.
I have already tokenized my file and counted the words. This is my code so far:
file = open("C:/Users/17733/Downloads/dracula.txt", 'r', encoding='utf-8-sig')
data = file.read()
data

data_list = data.split('\n')
data_list

new_list = []
for i in data_list:
    if i != '':
        ans_here = i.split(' ')
        new_list.extend(ans_here)
new_list

import string
import re

puncs = list(string.punctuation)
puncs.append('"')
puncs.append('[')
puncs.append('.')
puncs.append('-')
puncs.append('_')
# append each separately

new_2 = []
for i in new_list:
    for p in puncs:
        if p in i:
            i_new = i.replace(p, ' ')
            new_2.append(i_new)
new_2

new_2 = [i.replace('  ', ' ').strip().lower() for i in new_2]
new_2
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
output_filepath = Path('outfile.txt')
# print(filepath.exists())

with open(filepath) as f:
    content = f.readlines()

word_list = sum((
    (s.lower().strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])

less_common_words = sorted([
    key for key, value in Counter(word_list).items() if value <= 3
], reverse=True)

with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(less_common_words))
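One design note: sorted(..., reverse=True) here orders the surviving words alphabetically in reverse. If "descending order" in the task means descending by count, a hedged variant (same Counter, just a different sort key) would be:

word_counts = Counter(word_list)
less_common_words = sorted(
    (word for word, count in word_counts.items() if count <= 3),
    key=lambda w: word_counts[w],  # order by how often the word occurs
    reverse=True,
)

The counts can then be written next to each word, e.g. '\n'.join(f'{w} {word_counts[w]}' for w in less_common_words), since the task asks for each word with its count on a separate line.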
This should be exactly what you need; I fixed my previous error by flattening the entire txt file (read as a 2D list) into a flat list:
book_open = open('frankenstein.txt', 'r').readlines()
beauty_book = [i.split() for i in book_open]

flatten = []
for sublist in beauty_book:
    for val in sublist:
        flatten.append(val)

foo = 0
for i in flatten:
    list_open = open('less_common_words.txt', 'r').readlines()
    beauty_list = [i.replace('\n', '') for i in list_open]
    count = flatten.count(flatten[foo])
    compile = str((flatten[foo], count))
    if count <= 3:
        if compile not in beauty_list:
            file = open('less_common_words.txt', 'a+')
            file.write('\n' + compile)
            file.close()
    foo += 1
I am attempting to loop through a series of text files in a directory, looking for occurrences of certain types of words and prefixing each found word with a user-defined tag. My code is as follows.
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
        data = data.lower()

modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                      "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                 "cogitate":1, "contemplate":1, "deliberate":1}

format_modal = "<555>{} ".format
format_attribute = "<666>{} ".format
format_app_adaptor = "<777>{} ".format
format_plaus_shield = "<888>{} ".format

data = " ".join(format_modal(word) if word in modals else word for word in data.split())
data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())

with open(filename, "w") as f:
    f.write(str(data))
    print(data)  # This is just added in order to check on screen all files
                 # are being processed.
My problem is that although the code works on the last file in the directory, it is not working on the previous files (1 out of 10 in this case). I've tried a second for loop above the file write-out statements, but that is not working at all. Can anyone explain what I'm doing wrong here?
Regards
My speculation is that your code is only working on the last file because it's not indented properly to have all the relevant code within the for loop. Try it with this indentation:
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
        data = data.lower()
        modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
        personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                              "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
        approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                        "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
        plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                         "cogitate":1, "contemplate":1, "deliberate":1}
        format_modal = "<555>{} ".format
        format_attribute = "<666>{} ".format
        format_app_adaptor = "<777>{} ".format
        format_plaus_shield = "<888>{} ".format
        data = " ".join(format_modal(word) if word in modals else word for word in data.split())
        data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
        data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
        data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
    with open(filename, "w") as f:
        f.write(str(data))
        print(data)  # This is just added in order to check on screen all files
                     # are being processed.
Assuming all of your code is supposed to be inside your for loop: you are overwriting your text file each time, so it looks like only your last run is working:

# this overwrites the file
with open(filename, "w") as fh:
    fh.write(str(data))

Change to:

# this appends to the file
with open(filename, "a") as fh:
    fh.write(str(data))

This will append to your text file and will not overwrite previously added data with the data from the last loop.
I want to tokenize and sort reviews by keywords, but there is a problem opening the JSON file and the program throws an error: JSONDecodeError: Extra data: line 1 column 884 (char 883).
The files test2.json and keywords.txt are here:
https://github.com/SilverYar/TransportDataMiner
Here is my code:
import nltk
import string
import json
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords

st = RussianStemmer()

def tokenize_me(file_text):
    # applying nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # deleting punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]
    # deleting stop_words
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if (i not in stop_words)]
    # cleaning words
    tokens = [i.replace('«', '').replace('»', '') for i in tokens]
    return tokens

with open('C:\\Creme\\token\\keywords.txt') as fin:
    ww = fin.read().split(', ')
key_words = list(set([st.stem(w) for w in ww]))

with open('C:\\Creme\\token\\test2.json') as fin:
    text = json.load(fin)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
Help me find the error and fix the code.
You should not do this, because it leads to a malformed JSON file:

for dd in text:
    if tt:
        json.dump(dd, fout)  # <<-- cannot do this in the loop
        fout.write('\n')
Basically it should be written all at once, with a single dump() or dumps() call.
OK, you have to build the full list first and then output it to the file:

bad_words_list = []
for dd in text:
    words = tokenize_me(dd['description'])
    split_text = list(set([st.stem(word) for word in words]))
    tt = list(filter(lambda w: w in key_words, split_text))
    if tt:
        bad_words_list.append(dd)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    json.dump(bad_words_list, fout)
I decided to do it a little differently: read in the file and reformat the string into correct JSON format:

with open('C:\\Creme\\token\\test2.json', 'r', encoding='utf8') as fin:
    data = fin.read()
formated_text = data.replace('}{', '},{')
text = json.loads(f'[{formated_text}]')

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        #for d in dd:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        #break
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
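For reference, a hedged alternative for reading back-to-back JSON objects without string surgery is the standard library's JSONDecoder.raw_decode. This is only a sketch and assumes the file really is a sequence of concatenated objects (the helper name is illustrative):

import json

def load_concatenated_json(path, encoding='utf8'):
    # decode one JSON object at a time from a string of concatenated objects
    decoder = json.JSONDecoder()
    objects = []
    with open(path, encoding=encoding) as fin:
        data = fin.read()
    idx = 0
    while idx < len(data):
        obj, end = decoder.raw_decode(data, idx)  # returns the object and the index where it ended
        objects.append(obj)
        idx = end
        while idx < len(data) and data[idx].isspace():  # skip whitespace between objects
            idx += 1
    return objects

# text = load_concatenated_json('C:\\Creme\\token\\test2.json')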
I have two text files. The 1st file contains English sentences and the 2nd file contains a number of English words (a vocabulary). I want to remove from the sentences in the 1st file those words which are not present in the vocabulary, and then save the processed text back into the 1st file.
I wrote code with which I am able to get those sentences that contain words not available in the 2nd file (the vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt", "a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = f.readlines()
with open('eng.txt', 'r+') as f:
    eng = f.readlines()

vocab_sentences = [i.split(" ") for i in vocabulary]
eng = [i.split(" ") for i in eng]
cleaned_sentences = []

# loop over the sentences and exclude words in eng
for sent in vocab_sentences:
    cleaned_sentences.append(" ".join([i for i in sent if i not in eng]))

# write the file
with open('vocab30000.txt', 'w') as f:
    f.writelines(cleaned_sentences)
You can try this code. I tried not to use any loops, to save runtime in case you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("\\b(" + "|".join(punctuation) + ")\\W", re.I)
s_copy = pattern.sub(" ", s_copy)
s_copy = s_copy.replace("\"", "")
s_words = s_copy.split(" ")

with open('vocab30000.txt', 'r') as f:
    check_words = f.read()
remove_words = list(set(s_words) - set(check_words))
pattern = re.compile("\\b(" + "|".join(remove_words[1:]) + ")\\W", re.I)
pattern.sub("", s)
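Note that the last line discards its result. A hedged completion, assuming the goal from the question of saving the processed text back into eng.txt, would be:

cleaned = pattern.sub("", s)      # keep the substituted text instead of discarding it
with open('eng.txt', 'w') as f:   # write the processed sentences back to the 1st file
    f.write(cleaned)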