Count of Specific words in Multiple text files - python

I have a multiple text files and I need to find and cound specific words in those files and write them in a csv file. Column A contains the txt file names and in the header the words and for each file name its count. With this code I am getting all the words and need to filter out exact words
for example the output should be like the image file I uploaded
header = ['Abuse', 'Accommodating', 'Accommodation', 'Accountability']
import csv
folderpaths = 'C:/Users/haris/Downloads/PDF/'
counter = Counter()
filepaths = glob(os.path.join(folderpaths,'*.txt'))
for file in filepaths:
with open(file) as f:
words = re.findall(r'\w+', f.read().lower())
counter = counter + Counter(words)
print(counter)
f = open('C:/Users/haris/Downloads/PDF/firstcsv.csv', 'w')
writer = csv.writer(f)
for row in counter.items():
writer.writerow(row)
Files uploaded to google drive

Edit: As per your new request, I have added the "total_words" column. The code has been updated.
Below is a code that works. Just change the "folderpath" variable to the path of the folder with the text files, and change the "target_file" variable to where you want the output csv file to be created.
Sample csv output:
Code:
from collections import Counter
import glob
import os
import re
header = ['annual', 'investment', 'statement', 'range' , 'deposit' , 'supercalifragilisticexpialidocious']
folderpath = r'C:\Users\USERname4\Desktop\myfolder'
target_file = r'C:\Users\USERname4\Desktop\mycsv.csv'
queueWAP = []
def writeAndPrint(fileObject,toBeWAP,opCode=0):
global queueWAP
if (opCode == 0):
fileObject.write(toBeWAP)
print(toBeWAP)
if (opCode == 1):
queueWAP.append(toBeWAP)
if (opCode == 2):
for temp4 in range(len(queueWAP)):
fileObject.write(queueWAP[temp4])
print(queueWAP[temp4])
queueWAP = []
mycsvfile = open(target_file, 'w')
writeAndPrint(mycsvfile,"file_name,total_words")
for temp1 in header:
writeAndPrint(mycsvfile,","+temp1)
writeAndPrint(mycsvfile,"\n")
filepaths = glob.glob(folderpath + r"\*.txt")
for file in filepaths:
with open(file) as f:
writeAndPrint(mycsvfile,file.split("\\")[-1])
counter = Counter()
words = re.findall(r'\w+', f.read().lower())
counter = counter + Counter(words)
for temp2 in header:
temp3 = False
temp5 = 0
for myword in counter.items():
temp5 = temp5 + 1
if myword[0] == temp2:
writeAndPrint(mycsvfile,","+str(myword[1]),1)
temp3 = True
if temp3 == False:
writeAndPrint(mycsvfile,","+"0",1)
writeAndPrint(mycsvfile,","+str(temp5))
writeAndPrint(mycsvfile,"",2)
writeAndPrint(mycsvfile,"\n")
mycsvfile.close()

Using 'Counter' seems to be the right choice here, but I think you are using it wrong.
Here is a possible solution that may work for you:
words = ['Abuse', 'Accommodating', 'Accommodation', 'Accountability']
rows = []
for file in filepaths:
with open(file, 'r') as f:
words_in_file = [word for line in f for word in line.split()]
# this will count all the words in the file (not optimal)
wordcounts = Counter(words_in_file)
# interested only in specific words
counts = list(map(lambda x: wordcounts[x], words))
# insert first column (filenam)
counts.insert(0, file)
# append it to the rest of the rows
rows.append(counts)
f = open('C:/Users/haris/Downloads/PDF/firstcsv.csv', 'w')
writer = csv.writer(f)
for row in rows:
writer.writerow(row)

Related

How to save words that occur no more than 3 times in a text? Reading and writing files

I am working on a text file right now that is called "dracula.txt", and I have to do the following in python:
Save words that occur no more than 3 times in descending order in a file called less_common_words.txt. Each word with its count should be saved on a separate line.
I would appreciate any help! I've been working on this for too long.
I have already tokenized my file and counted the words. This is my code so far:
file = open("C:/Users/17733/Downloads/dracula.txt", 'r', encoding = 'utf-8-sig')
data = file.read()
data
data_list = data.split('\n')
data_list
new_list = []
for i in data_list:
if i !='':
ans_here = i.split(' ')
new_list.extend(ans_here)
new_list
import string
import re
puncs = list(string.punctuation)
puncs.append('"')
puncs.append('[')
puncs.append('.')
puncs.append('-')
puncs.append('_')
#append each seperately
new_2 = []
for i in new_list:
for p in puncs:
if p in i:
i_new = i.replace(p, ' ')
new_2.append(i_new)
new_2
new_2 = [i.replace(' ', ' ').strip().lower() for i in new_2]
new_2
from pathlib import Path
from collections import Counter
import string
filepath = Path('test.txt')
output_filepath = Path('outfile.txt')
# print(filepath.exists())
with open(filepath) as f:
content = f.readlines()
word_list = sum((
(s.lower().strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
for s in content
), [])
less_common_words = sorted([
key for key, value in Counter(word_list).items() if value <= 3
],reverse=True)
with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
myfile.write('\n'.join(less_common_words))
This should exactly be what you need- I fixed my previous error by flattening the entire txt into a 2d list:
book_open = open('frankenstein.txt', 'r').readlines()
beauty_book = [i.split() for i in book_open]
flatten = []
for sublist in beauty_book:
for val in sublist:
flatten.append(val)
foo = 0
for i in flatten:
list_open = open('less_common_words.txt', 'r').readlines()
beauty_list = [i.replace('\n', '') for i in list_open]
count = flatten.count(flatten[foo])
compile = str((flatten[foo], count))
if count <= 3:
if compile not in beauty_list:
file = open('less_common_words.txt', 'a+')
file.write('\n'+compile)
file.close()
foo += 1

Python: Counting words from a directory of txt files and writing word counts to a separate txt file

New to Python and I'm trying to count the words in a directory of text files and write the output to a separate text file. However, I want to specify conditions. So if word count is > 0 is would like to write the count and file path to one file and if the count is == 0. I would like to write the count and file path to a separate file. Below is my code so far. I think I'm close, but I'm hung up on how to do the conditions and separate files. Thanks.
import sys
import os
from collections import Counter
import glob
stdoutOrigin=sys.stdout
sys.stdout = open("log.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
for filepath in glob.iglob(os.path.join("path", '*.txt')):
with open(filepath) as f:
data = f.read()
for key,val in words.items():
#print("key is " + key + "\n")
ct = data.count(key)
words[key] = ct
if action:
action(filepath, words)
def print_summary(filepath, words):
for key,val in sorted(words.items()):
print(filepath)
if val > 0:
print('{0}:\t{1}'.format(
key,
val))
filepath = sys.argv[1]
keys = ["x", "y"]
words = dict.fromkeys(keys,0)
count_words_in_dir(filepath, words, action=print_summary)
sys.stdout.close()
sys.stdout=stdoutOrigin
I would strongly urge you to not repurpose stdout for writing data to a file as part of the normal course of your program. I also wonder how you can ever have a word "count < 0". I assume you meant "count == 0".
The main problem that your code has is in this line:
for filepath in glob.iglob(os.path.join("path", '*.txt')):
The string constant "path" I'm pretty sure doesn't belong there. I think you want filepath there instead. I would think that this problem would prevent your code from working at all.
Here's a version of your code where I fixed these issues and added the logic to write to two different output files based on the count:
import sys
import os
import glob
out1 = open("/tmp/so/seen.txt", "w")
out2 = open("/tmp/so/missing.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
with open(filepath) as f:
data = f.read()
for key, val in words.items():
# print("key is " + key + "\n")
ct = data.count(key)
words[key] = ct
if action:
action(filepath, words)
def print_summary(filepath, words):
for key, val in sorted(words.items()):
whichout = out1 if val > 0 else out2
print(filepath, file=whichout)
print('{0}: {1}'.format(key, val), file=whichout)
filepath = sys.argv[1]
keys = ["country", "friend", "turnip"]
words = dict.fromkeys(keys, 0)
count_words_in_dir(filepath, words, action=print_summary)
out1.close()
out2.close()
Result:
file seen.txt:
/Users/steve/tmp/so/dir/data2.txt
friend: 1
/Users/steve/tmp/so/dir/data.txt
country: 2
/Users/steve/tmp/so/dir/data.txt
friend: 1
file missing.txt:
/Users/steve/tmp/so/dir/data2.txt
country: 0
/Users/steve/tmp/so/dir/data2.txt
turnip: 0
/Users/steve/tmp/so/dir/data.txt
turnip: 0
(excuse me for using some search words that were a bit more interesting than yours)
Hello I hope I understood your question correctly, this code will count how many different words are in your file and depending on the conditions will do something you want.
import os
all_words = {}
def count(file_path):
with open(file_path, "r") as f:
# for better performance it is a good idea to go line by line through file
for line in f:
# singles out all the words, by splitting string around spaces
words = line.split(" ")
# and checks if word already exists in all_words dictionary...
for word in words:
try:
# ...if it does increment number of repetitions
all_words[word.replace(",", "").replace(".", "").lower()] += 1
except Exception:
# ...if it doesn't create it and give it number of repetitions 1
all_words[word.replace(",", "").replace(".", "").lower()] = 1
if __name__ == '__main__':
# for every text file in your current directory count how many words it has
for file in os.listdir("."):
if file.endswith(".txt"):
all_words = {}
count(file)
n = len(all_words)
# depending on the number of words do something
if n > 0:
with open("count1.txt", "a") as f:
f.write(file + "\n" + str(n) + "\n")
else:
with open("count2.txt", "a") as f:
f.write(file + "\n" + str(n) + "\n")
if you want to count same word multiple times you can add up all values from dictionary or you can eliminate try-except block and count every word there.

Write to csv from modified csv files with python code

I'm trying to process data(remove hastag, link and #) from CSV files and stored it back to CSV. however the output does not perform well. it is separated with a comma for each character. Can anyone help me to write it to csv in a better way. thank you
import re,string
import csv
def strip_links(text):
link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:##%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
links = re.findall(link_regex, text)
for link in links:
text = text.replace(link[0], ', ')
return text
def strip_all_entities(text):
entity_prefixes = ['#','#']
for separator in string.punctuation:
if separator not in entity_prefixes :
text = text.replace(separator,' ')
words = []
for word in text.split():
word = word.strip()
if word:
if word[0] not in entity_prefixes:
words.append(word)
return ' '.join(words)
f = open('Test.csv')
csf_f = csv.reader(f)
temp =[]
for row in csf_f:
temp.append(row[0])
temp1 = []
for t in temp:
temp1.append(strip_all_entities(strip_links(t)))
for i in range(0, len(temp1)):
with open('MYOUTPUT.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(temp1)
f.close()

How to search for a string in part of text?

I am trying to search multiple text files for the text "1-2","2-3","3-H" which occur in the last field of the lines of text that start with "play".
An example of the text file is show below
id,ARI201803290
version,2
info,visteam,COL
info,hometeam,ARI
info,site,PHO01
play,1,0,lemad001,22,CFBBX,HR/78/F
play,1,0,arenn001,20,BBX,S7/L+
play,1,0,stort001,12,SBCFC,K
play,1,0,gonzc001,02,SS>S,K
play,1,1,perad001,32,BTBBCX,S9/G
play,1,1,polla001,02,CSX,S7/L+.1-2
play,1,1,goldp001,32,SBFBBB,W.2-3;1-2
play,1,1,lambj001,00,X,D9/F+.3-H;2-H;1-3
play,1,1,avila001,31,BC*BBX,31/G.3-H;2-3
play,2,0,grayj003,12,CC*BS,K
play,2,1,dysoj001,31,BBCBX,43/G
play,2,1,corbp001,31,CBBBX,43/G
play,4,1,avila001,02,SC1>X,S8/L.1-2
For the text file above, I would like the output to be '4' since there are 4 occurrences of "1-2","2-3" and "3-H" in total.
The code I have got so far is below, however I'm not sure where to start with writing a line of code to do this function.
import os
input_folder = 'files' # path of folder containing the multiple text files
# create a list with file names
data_files = [os.path.join(input_folder, file) for file in
os.listdir(input_folder)]
# open csv file for writing
csv = open('myoutput.csv', 'w')
def write_to_csv(line):
print(line)
csv.write(line)
j=0 # initialise as 0
count_of_plate_appearances=0 # initialise as 0
for file in data_files:
with open(file, 'r') as f: # use context manager to open files
for line in f:
lines = f.readlines()
i=0
while i < len(lines):
temp_array = lines[i].rstrip().split(",")
if temp_array[0] == "id":
j=0
count_of_plate_appearances=0
game_id = temp_array[1]
awayteam = lines[i+2].rstrip().split(",")[2]
hometeam = lines[i+3].rstrip().split(",")[2]
date = lines[i+5].rstrip().split(",")[2]
for j in range(i+46,i+120,1): #only check for plate appearances this when temp_array[0] == "id"
temp_array2 = lines[j].rstrip().split(",") #create new array to check for plate apperances
if temp_array2[0] == "play" and temp_array2[2] == "1": # plate apperance occurs when these are true
count_of_plate_appearances=count_of_plate_appearances+1
#print(count_of_plate_appearances)
output_for_csv2=(game_id,date,hometeam, awayteam,str(count_of_plate_appearances))
print(output_for_csv2)
csv.write(','.join(output_for_csv2) + '\n')
i=i+1
else:
i=i+1
j=0
count_of_plate_appearances=0
#quit()
csv.close()
Any suggestions on how I can do this? Thanks in advance!
You can use regex, I put your text in a file called file.txt.
import re
a = ['1-2', '2-3', '3-H'] # What you want to count
find_this = re.compile('|'.join(a)) # Make search string
count = 0
with open('file.txt', 'r') as f:
for line in f.readlines():
count += len(find_this.findall(line)) # Each findall returns the list of things found
print(count) # 7
or a shorter solution: (Credit to wjandrea for hinting the use of a generator)
import re
a = ['1-2', '2-3', '3-H'] # What you want to count
find_this = re.compile('|'.join(a)) # Make search string
with open('file.txt', 'r') as f:
count = sum(len(find_this.findall(line)) for line in f)
print(count) # 7

Output from function to text/CSV file?

I am counting the number of contractions in a certain set of presidential speeches, and want to output these contractions to a CSV or text file. Here's my code:
import urllib2,sys,os,csv
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
from multiprocessing import Pool
import re, nltk
import requests
import math, functools
import summarize
reload(sys)
def processURL_short(l):
open_url = urllib2.urlopen(l).read()
item_soup = BeautifulSoup(open_url)
item_div = item_soup.find('div',{'id':'transcript'},{'class':'displaytext'})
item_str = item_div.text.lower()
return item_str
every_link_test = ['http://www.millercenter.org/president/obama/speeches/speech-4427',
'http://www.millercenter.org/president/obama/speeches/speech-4424',
'http://www.millercenter.org/president/obama/speeches/speech-4453',
'http://www.millercenter.org/president/obama/speeches/speech-4612',
'http://www.millercenter.org/president/obama/speeches/speech-5502']
data = {}
count = 0
for l in every_link_test:
content_1 = processURL_short(l)
for word in content_1.split():
word = word.strip(p)
if word in contractions:
count = count + 1
splitlink = l.split("/")
president = splitlink[4]
speech_num = splitlink[-1]
filename = "{0}_{1}".format(president,speech_num)
data[filename] = count
print count, filename
with open('contraction_counts.csv','w',newline='') as fp:
a = csv.writer(fp,delimiter = ',')
a.writerows(data)
Running that for loop prints out
79 obama_speech-4427
101 obama_speech-4424
101 obama_speech-4453
182 obama_speech-4612
224 obama_speech-5502
I want to export that to a text file, where the numbers on the left are one column, and the president/speech number are in the second column. My with statement just writes each individual row to a separate file, which is definitely suboptimal.
You can try something like this, this is a generic method, modify as you see fit
import csv
with open('somepath/file.txt', 'wb+') as outfile:
w = csv.writer(outfile)
w.writerow(['header1', 'header2'])
for i in you_data_structure: # eg list or dictionary i'm assuming a list structure
w.writerow([
i[0],
i[1],
])
or if a dictionary
import csv
with open('somepath/file.txt', 'wb+') as outfile:
w = csv.writer(outfile)
w.writerow(['header1', 'header2'])
for k, v in your_dictionary.items(): # eg list or dictionary i'm assuming a list structure
w.writerow([
k,
v,
])
Your problem is that you open the output file inside the loop in w mode, meaning that it is erased on each iteration. You can easily solve it in 2 ways:
mode the open outside of the loop (normal way). You will open the file only once, add a line on each iteration and close it when exiting the with block:
with open('contraction_counts.csv','w',newline='') as fp:
a = csv.writer(fp,delimiter = ',')
for l in every_link_test:
content_1 = processURL_short(l)
for word in content_1.split():
word = word.strip(p)
if word in contractions:
count = count + 1
splitlink = l.split("/")
president = splitlink[4]
speech_num = splitlink[-1]
filename = "{0}_{1}".format(president,speech_num)
data[filename] = count
print count, filename
a.writerows(data)
open the file in a (append) mode. On each iteration you reopen the file and write at the end instead of erasing it - this way uses more IO resources because of the open/close, and should be used only if the program can break and you want to be sure that all that was written before the crash has actually been saved to disk
for l in every_link_test:
content_1 = processURL_short(l)
for word in content_1.split():
word = word.strip(p)
if word in contractions:
count = count + 1
splitlink = l.split("/")
president = splitlink[4]
speech_num = splitlink[-1]
filename = "{0}_{1}".format(president,speech_num)
data[filename] = count
print count, filename
with open('contraction_counts.csv','a',newline='') as fp:
a = csv.writer(fp,delimiter = ',')
a.writerows(data)

Categories

Resources