I am counting the number of contractions in a certain set of presidential speeches, and want to output these contractions to a CSV or text file. Here's my code:
import urllib2,sys,os,csv
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
from multiprocessing import Pool
import re, nltk
import requests
import math, functools
import summarize
reload(sys)
def processURL_short(l):
    """Fetch the page at URL *l* and return its transcript text in lower case.

    The transcript is the text of the <div> whose id is 'transcript' and
    whose class is 'displaytext'.
    """
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)
    # find()'s third positional parameter is `recursive`, so a second dict
    # passed there is not treated as an attribute filter; both filters must
    # live in the single attrs dict.
    item_div = item_soup.find('div', {'id': 'transcript', 'class': 'displaytext'})
    item_str = item_div.text.lower()
    return item_str
every_link_test = ['http://www.millercenter.org/president/obama/speeches/speech-4427',
'http://www.millercenter.org/president/obama/speeches/speech-4424',
'http://www.millercenter.org/president/obama/speeches/speech-4453',
'http://www.millercenter.org/president/obama/speeches/speech-4612',
'http://www.millercenter.org/president/obama/speeches/speech-5502']
data = {}
count = 0
for l in every_link_test:
content_1 = processURL_short(l)
for word in content_1.split():
word = word.strip(p)
if word in contractions:
count = count + 1
splitlink = l.split("/")
president = splitlink[4]
speech_num = splitlink[-1]
filename = "{0}_{1}".format(president,speech_num)
data[filename] = count
print count, filename
with open('contraction_counts.csv','w',newline='') as fp:
a = csv.writer(fp,delimiter = ',')
a.writerows(data)
Running that for loop prints out
79 obama_speech-4427
101 obama_speech-4424
101 obama_speech-4453
182 obama_speech-4612
224 obama_speech-5502
I want to export that to a text file, where the numbers on the left are one column, and the president/speech number are in the second column. My with statement just writes each individual row to a separate file, which is definitely suboptimal.
You can try something like this, this is a generic method, modify as you see fit
import csv
with open('somepath/file.txt', 'wb+') as outfile:
    w = csv.writer(outfile)
    w.writerow(['header1', 'header2'])
    # your_data_structure is a placeholder for any iterable of two-item
    # sequences, e.g. a list of (value, label) tuples.
    for i in your_data_structure:
        w.writerow([i[0], i[1]])
or if a dictionary
import csv
with open('somepath/file.txt', 'wb+') as outfile:
    w = csv.writer(outfile)
    w.writerow(['header1', 'header2'])
    # your_dictionary is a placeholder; one row is written per entry, with
    # the key in the first column and the value in the second.
    for k, v in your_dictionary.items():
        w.writerow([k, v])
Your problem is that you open the output file inside the loop in w mode, meaning that it is erased on each iteration. You can easily solve it in 2 ways:
Move the open outside of the loop (the normal way). You will open the file only once, add a line on each iteration, and close it when exiting the with block:
with open('contraction_counts.csv','w',newline='') as fp:
a = csv.writer(fp,delimiter = ',')
for l in every_link_test:
content_1 = processURL_short(l)
for word in content_1.split():
word = word.strip(p)
if word in contractions:
count = count + 1
splitlink = l.split("/")
president = splitlink[4]
speech_num = splitlink[-1]
filename = "{0}_{1}".format(president,speech_num)
data[filename] = count
print count, filename
a.writerows(data)
open the file in a (append) mode. On each iteration you reopen the file and write at the end instead of erasing it - this way uses more IO resources because of the open/close, and should be used only if the program can break and you want to be sure that all that was written before the crash has actually been saved to disk
for l in every_link_test:
content_1 = processURL_short(l)
for word in content_1.split():
word = word.strip(p)
if word in contractions:
count = count + 1
splitlink = l.split("/")
president = splitlink[4]
speech_num = splitlink[-1]
filename = "{0}_{1}".format(president,speech_num)
data[filename] = count
print count, filename
with open('contraction_counts.csv','a',newline='') as fp:
a = csv.writer(fp,delimiter = ',')
a.writerows(data)
Related
I have multiple text files and I need to find and count specific words in those files and write the counts to a CSV file. Column A contains the txt file names, the header row contains the words, and each row holds the counts for that file. With this code I am getting all the words, and I need to filter the output down to the exact words I care about.
for example the output should be like the image file I uploaded
header = ['Abuse', 'Accommodating', 'Accommodation', 'Accountability']
import csv
folderpaths = 'C:/Users/haris/Downloads/PDF/'
# One Counter accumulates the word frequencies of every file in the folder.
counter = Counter()
filepaths = glob(os.path.join(folderpaths,'*.txt'))
for file in filepaths:
    with open(file) as f:
        words = re.findall(r'\w+', f.read().lower())
    counter.update(words)
print(counter)
# The context manager closes the output file, which also flushes the rows.
with open('C:/Users/haris/Downloads/PDF/firstcsv.csv', 'w') as f:
    writer = csv.writer(f)
    # Each row is a (word, count) pair.
    for row in counter.items():
        writer.writerow(row)
Files uploaded to google drive
Edit: As per your new request, I have added the "total_words" column. The code has been updated.
Below is a code that works. Just change the "folderpath" variable to the path of the folder with the text files, and change the "target_file" variable to where you want the output csv file to be created.
Sample csv output:
Code:
from collections import Counter
import glob
import os
import re
header = ['annual', 'investment', 'statement', 'range' , 'deposit' , 'supercalifragilisticexpialidocious']
folderpath = r'C:\Users\USERname4\Desktop\myfolder'
target_file = r'C:\Users\USERname4\Desktop\mycsv.csv'
# Pending strings collected by writeAndPrint(..., opCode=1).
queueWAP = []


def writeAndPrint(fileObject, toBeWAP, opCode=0):
    """Write text to *fileObject* and echo it with print(), optionally deferred.

    opCode selects the action:
      0 -- write *toBeWAP* to *fileObject* and print it immediately;
      1 -- append *toBeWAP* to the module-level queue without writing it;
      2 -- write and print every queued string in order, then empty the
           queue (*toBeWAP* is ignored).
    Any other opCode does nothing.
    """
    global queueWAP
    if opCode == 0:
        fileObject.write(toBeWAP)
        print(toBeWAP)
    elif opCode == 1:
        queueWAP.append(toBeWAP)
    elif opCode == 2:
        for queued in queueWAP:
            fileObject.write(queued)
            print(queued)
        queueWAP = []
# Build the CSV: one header line, then one line per *.txt file holding the
# file name, a word total and the count of each word listed in `header`.
# The with-block closes the CSV even if reading one of the inputs fails.
with open(target_file, 'w') as mycsvfile:
    writeAndPrint(mycsvfile, "file_name,total_words")
    for temp1 in header:
        writeAndPrint(mycsvfile, "," + temp1)
    writeAndPrint(mycsvfile, "\n")
    filepaths = glob.glob(os.path.join(folderpath, "*.txt"))
    for file in filepaths:
        with open(file) as f:
            words = re.findall(r'\w+', f.read().lower())
        counter = Counter(words)
        writeAndPrint(mycsvfile, os.path.basename(file))
        # NOTE(review): this is the number of *distinct* words, as in the
        # original loop; use len(words) if the column is meant to hold the
        # total number of word occurrences.
        writeAndPrint(mycsvfile, "," + str(len(counter)))
        for temp2 in header:
            # A Counter returns 0 for words that never occurred, so no scan
            # over all items is needed.
            writeAndPrint(mycsvfile, "," + str(counter[temp2]))
        writeAndPrint(mycsvfile, "\n")
Using 'Counter' seems to be the right choice here, but I think you are using it wrong.
Here is a possible solution that may work for you:
words = ['Abuse', 'Accommodating', 'Accommodation', 'Accountability']
rows = []
for file in filepaths:
    with open(file, 'r') as f:
        # Counts every whitespace-separated token in the file (not optimal,
        # since only the words listed above are reported).
        wordcounts = Counter(word for line in f for word in line.split())
    # First column is the file name, followed by one count per word of
    # interest; a Counter yields 0 for words that never occur.
    rows.append([file] + [wordcounts[word] for word in words])
# newline='' lets the csv module control line endings itself, and the
# with-block closes the file so every row is flushed to disk.
with open('C:/Users/haris/Downloads/PDF/firstcsv.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(rows)
I am working on a text file right now that is called "dracula.txt", and I have to do the following in python:
Save words that occur no more than 3 times in descending order in a file called less_common_words.txt. Each word with its count should be saved on a separate line.
I would appreciate any help! I've been working on this for too long.
I have already tokenized my file and counted the words. This is my code so far:
import string
import re

# Read the whole book; the context manager closes the file afterwards.
with open("C:/Users/17733/Downloads/dracula.txt", 'r', encoding='utf-8-sig') as file:
    data = file.read()

# Split into lines, then into space-separated chunks, skipping empty lines.
data_list = data.split('\n')
new_list = []
for i in data_list:
    if i != '':
        new_list.extend(i.split(' '))

# string.punctuation already contains '"', '[', '.', '-' and '_', so no
# extra characters have to be appended.
puncs = list(string.punctuation)
punct_to_space = str.maketrans({p: ' ' for p in puncs})

# Replace every punctuation character with a space in a single pass, then
# lower-case and split.  Each chunk therefore contributes its words exactly
# once, whether or not it contained punctuation, and chunks made only of
# punctuation contribute nothing.
new_2 = []
for i in new_list:
    new_2.extend(i.translate(punct_to_space).lower().split())
from pathlib import Path
from collections import Counter
import string
filepath = Path('test.txt')
output_filepath = Path('outfile.txt')

# Translation table that deletes punctuation; built once, not once per line.
strip_punctuation = str.maketrans('', '', string.punctuation)

with open(filepath) as f:
    # split() with no argument discards the empty strings that split(' ')
    # produces for blank lines and repeated spaces.
    word_counts = Counter(
        word
        for line in f
        for word in line.lower().translate(strip_punctuation).split()
    )

# Keep the words seen at most three times, highest count first; words with
# the same count are ordered alphabetically.
less_common_words = sorted(
    ((word, n) for word, n in word_counts.items() if n <= 3),
    key=lambda pair: (-pair[1], pair[0]),
)

# One "word count" pair per line.
with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join('{0} {1}'.format(word, n) for word, n in less_common_words))
This should exactly be what you need- I fixed my previous error by flattening the entire txt into a 2d list:
from collections import Counter

# Read every whitespace-separated token of the book.
with open('frankenstein.txt', 'r') as book:
    flatten = [val for line in book for val in line.split()]

# One counting pass instead of calling list.count() for every token.
token_counts = Counter(flatten)

# Keep tokens seen at most three times, highest count first.  sorted() is
# stable, so tokens with equal counts keep the order in which
# token_counts yields them.
less_common = sorted(
    ((token, n) for token, n in token_counts.items() if n <= 3),
    key=lambda pair: -pair[1],
)

# Write the file in one go.  Opening with 'w' creates it when missing and
# makes re-runs produce the same content instead of appending to it.
# Each line is the text of a (token, count) tuple, e.g. ('word', 2).
with open('less_common_words.txt', 'w') as out:
    out.write('\n'.join(str(pair) for pair in less_common))
New to Python and I'm trying to count the words in a directory of text files and write the output to a separate text file. However, I want to specify conditions: if the word count is > 0, I would like to write the count and file path to one file, and if the count is == 0, I would like to write the count and file path to a separate file. Below is my code so far. I think I'm close, but I'm hung up on how to do the conditions and separate files. Thanks.
import sys
import os
from collections import Counter
import glob
# Everything printed below goes to log.txt until stdout is restored at the end.
stdoutOrigin=sys.stdout
sys.stdout = open("log.txt", "w")


def count_words_in_dir(dirpath, words, action=None):
    """Store, for each key of *words*, how often it occurs in each file found.

    After each file is read, *action* (if given) is called with the file
    path and the updated *words* dict.
    """
    for filepath in glob.iglob(os.path.join("path", '*.txt')):
        with open(filepath) as f:
            data = f.read()
            for key,val in words.items():
                #print("key is " + key + "\n")
                ct = data.count(key)
                words[key] = ct
            if action:
                action(filepath, words)


def print_summary(filepath, words):
    """Print the file path once per key, plus the key and count when positive."""
    for key,val in sorted(words.items()):
        print(filepath)
        if val > 0:
            print('{0}:\t{1}'.format(
                key,
                val))


filepath = sys.argv[1]
keys = ["x", "y"]
words = dict.fromkeys(keys,0)
count_words_in_dir(filepath, words, action=print_summary)
sys.stdout.close()
sys.stdout=stdoutOrigin
I would strongly urge you to not repurpose stdout for writing data to a file as part of the normal course of your program. I also wonder how you can ever have a word "count < 0". I assume you meant "count == 0".
The main problem that your code has is in this line:
for filepath in glob.iglob(os.path.join("path", '*.txt')):
The string constant "path" I'm pretty sure doesn't belong there. I think you want filepath there instead. I would think that this problem would prevent your code from working at all.
Here's a version of your code where I fixed these issues and added the logic to write to two different output files based on the count:
import sys
import os
import glob
# Output files used by print_summary: entries with a positive count are
# written to out1, entries with a zero count to out2.
out1 = open("/tmp/so/seen.txt", "w")
out2 = open("/tmp/so/missing.txt", "w")
def count_words_in_dir(dirpath, words, action=None):
    """Count each key of *words* in every ``*.txt`` file directly in *dirpath*.

    For each file, every value in *words* is overwritten with the number of
    non-overlapping occurrences of its key in the file's text (``str.count``,
    so a key that appears inside a longer word is counted as well).  If
    *action* is given it is then called as ``action(filepath, words)``.

    The same dict is reused for every file, so an action that wants to keep
    the counts must copy them.  Returns None.
    """
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            data = f.read()
        # Only existing keys are reassigned, so iterating the dict while
        # updating its values is safe.
        for key in words:
            words[key] = data.count(key)
        if action:
            action(filepath, words)
def print_summary(filepath, words):
    """Write one two-line entry per key of *words* to the matching output file.

    Keys are processed in sorted order.  An entry consists of *filepath* on
    one line and ``key: count`` on the next; it goes to the module-level
    ``out1`` when the count is positive and to ``out2`` otherwise.
    """
    for key, val in sorted(words.items()):
        whichout = out1 if val > 0 else out2
        print(filepath, file=whichout)
        print('{0}: {1}'.format(key, val), file=whichout)
# The directory to scan is the first command-line argument.
filepath = sys.argv[1]
keys = ["country", "friend", "turnip"]
# Every search word starts with a count of 0.
words = dict.fromkeys(keys, 0)
count_words_in_dir(filepath, words, action=print_summary)
# Close both output files so everything written is flushed to disk.
out1.close()
out2.close()
Result:
file seen.txt:
/Users/steve/tmp/so/dir/data2.txt
friend: 1
/Users/steve/tmp/so/dir/data.txt
country: 2
/Users/steve/tmp/so/dir/data.txt
friend: 1
file missing.txt:
/Users/steve/tmp/so/dir/data2.txt
country: 0
/Users/steve/tmp/so/dir/data2.txt
turnip: 0
/Users/steve/tmp/so/dir/data.txt
turnip: 0
(excuse me for using some search words that were a bit more interesting than yours)
Hello I hope I understood your question correctly, this code will count how many different words are in your file and depending on the conditions will do something you want.
import os
all_words = {}


def count(file_path):
    """Tally how often each word occurs in the file at *file_path*.

    Results accumulate in the module-level ``all_words`` dict, keyed by the
    lower-cased word with commas and full stops removed.
    """
    with open(file_path, "r") as f:
        # Going line by line avoids loading the whole file into memory.
        for line in f:
            # split() with no argument also removes the trailing newline and
            # the empty strings that split(" ") yields for repeated spaces.
            for word in line.split():
                key = word.replace(",", "").replace(".", "").lower()
                if not key:
                    # The token consisted only of commas and full stops.
                    continue
                all_words[key] = all_words.get(key, 0) + 1
if __name__ == '__main__':
    # The two report files also end in ".txt" and live in the scanned
    # directory; skip them so a second run does not count them as input.
    report_files = ("count1.txt", "count2.txt")
    # For every text file in the current directory, count its distinct words.
    for file in os.listdir("."):
        if not file.endswith(".txt") or file in report_files:
            continue
        all_words = {}
        count(file)
        n = len(all_words)
        # Files with at least one word are reported in count1.txt, files
        # without any word in count2.txt.
        target = "count1.txt" if n > 0 else "count2.txt"
        with open(target, "a") as f:
            f.write(file + "\n" + str(n) + "\n")
if you want to count same word multiple times you can add up all values from dictionary or you can eliminate try-except block and count every word there.
I am trying to search multiple text files for the text "1-2","2-3","3-H" which occur in the last field of the lines of text that start with "play".
An example of the text file is shown below.
id,ARI201803290
version,2
info,visteam,COL
info,hometeam,ARI
info,site,PHO01
play,1,0,lemad001,22,CFBBX,HR/78/F
play,1,0,arenn001,20,BBX,S7/L+
play,1,0,stort001,12,SBCFC,K
play,1,0,gonzc001,02,SS>S,K
play,1,1,perad001,32,BTBBCX,S9/G
play,1,1,polla001,02,CSX,S7/L+.1-2
play,1,1,goldp001,32,SBFBBB,W.2-3;1-2
play,1,1,lambj001,00,X,D9/F+.3-H;2-H;1-3
play,1,1,avila001,31,BC*BBX,31/G.3-H;2-3
play,2,0,grayj003,12,CC*BS,K
play,2,1,dysoj001,31,BBCBX,43/G
play,2,1,corbp001,31,CBBBX,43/G
play,4,1,avila001,02,SC1>X,S8/L.1-2
For the text file above, I would like the output to be '4' since there are 4 occurrences of "1-2","2-3" and "3-H" in total.
The code I have got so far is below, however I'm not sure where to start with writing a line of code to do this function.
import os
# Scans every file in input_folder for lines whose first field is "id" and
# writes one comma-separated summary line per such record to myoutput.csv.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below is not recoverable from the text alone.
input_folder = 'files' # path of folder containing the multiple text files
# create a list with file names
data_files = [os.path.join(input_folder, file) for file in
os.listdir(input_folder)]
# open csv file for writing
# NOTE: the name `csv` is bound to a plain file object here, not the csv module.
csv = open('myoutput.csv', 'w')
# Prints a line and writes it to the output file; the visible code below
# calls csv.write directly and never calls this helper.
def write_to_csv(line):
print(line)
csv.write(line)
j=0 # initialise as 0
count_of_plate_appearances=0 # initialise as 0
for file in data_files:
with open(file, 'r') as f: # use context manager to open files
# NOTE(review): if readlines() sits inside this for loop, the line already
# taken by the iterator is not part of `lines` -- confirm the intended nesting.
for line in f:
lines = f.readlines()
i=0
while i < len(lines):
temp_array = lines[i].rstrip().split(",")
if temp_array[0] == "id":
j=0
count_of_plate_appearances=0
game_id = temp_array[1]
# The team and date values are read at fixed offsets (2, 3 and 5 lines)
# after the "id" line.
awayteam = lines[i+2].rstrip().split(",")[2]
hometeam = lines[i+3].rstrip().split(",")[2]
date = lines[i+5].rstrip().split(",")[2]
# Only the fixed window of lines i+46 .. i+119 is inspected; lines[j] raises
# IndexError if the file has fewer lines than that.
for j in range(i+46,i+120,1): #only check for plate appearances this when temp_array[0] == "id"
temp_array2 = lines[j].rstrip().split(",") #create new array to check for plate apperances
if temp_array2[0] == "play" and temp_array2[2] == "1": # plate apperance occurs when these are true
count_of_plate_appearances=count_of_plate_appearances+1
#print(count_of_plate_appearances)
output_for_csv2=(game_id,date,hometeam, awayteam,str(count_of_plate_appearances))
print(output_for_csv2)
csv.write(','.join(output_for_csv2) + '\n')
i=i+1
else:
i=i+1
j=0
count_of_plate_appearances=0
#quit()
csv.close()
Any suggestions on how I can do this? Thanks in advance!
You can use regex, I put your text in a file called file.txt.
import re
a = ['1-2', '2-3', '3-H']  # What you want to count
# Escape each item so it is matched literally even if it ever contains a
# regex metacharacter.
find_this = re.compile('|'.join(re.escape(item) for item in a))  # Make search string
count = 0
with open('file.txt', 'r') as f:
    for line in f:
        # Count only inside the last comma-separated field of lines that
        # start with "play".
        if line.startswith('play,'):
            last_field = line.rstrip('\n').rsplit(',', 1)[-1]
            count += len(find_this.findall(last_field))  # findall returns the list of things found
print(count)  # 7
or a shorter solution: (Credit to wjandrea for hinting the use of a generator)
import re
a = ['1-2', '2-3', '3-H']  # What you want to count
# Escape each item so it is matched literally even if it ever contains a
# regex metacharacter.
find_this = re.compile('|'.join(re.escape(item) for item in a))  # Make search string
with open('file.txt', 'r') as f:
    # Count only inside the last comma-separated field of lines that start
    # with "play".
    count = sum(
        len(find_this.findall(line.rstrip('\n').rsplit(',', 1)[-1]))
        for line in f
        if line.startswith('play,')
    )
print(count)  # 7
How can I find only the words that are unique to a text file? If a word is used frequently in other files, it should be dropped.
Here is a reference http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html
I need a script which loops through all text files in a folder and outputs the results in Json format.
My code so far :
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os
def get_raw_data():
    """Yield ``(file_name, text)`` for each of ``1.txt`` through ``95.txt``.

    Files are read as UTF-8 from the current working directory.  Each pair
    carries the text of that one file only.
    """
    # range(1, 96) covers 1.txt .. 95.txt; the previous range(1, 95) combined
    # with x+1 skipped 1.txt.
    for x in range(1, 96):
        file_name = str(x) + ".txt"
        # Plain "r": the 'U' flag was removed from open() in Python 3.11.
        with codecs.open(file_name, "r", "utf-8") as myfile:
            data = myfile.read()
        # Yield after the file is closed so no handle stays open while the
        # consumer holds the generator suspended.
        yield file_name, data
class StemTokenizer(object):
    """Callable tokenizer that maps each token to the form wn.morphy returns."""

    def __init__(self):
        # Forms that are never emitted by __call__.
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        """Return the list of accepted word forms in *doc*, in order.

        Each token from word_tokenize is lower-cased and passed to
        wn.morphy.  The result is kept only if it is non-empty, longer than
        one character and not in ``ignore_set``.
        """
        words = []
        for word in word_tokenize(doc):
            w = wn.morphy(word.lower())
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words
def process_text(counts, vectorizer, text, file_name, index):
    """Return the normalized counts of the frequent words of one document.

    *counts* is indexed as ``counts[index][column]`` and
    ``vectorizer.vocabulary_`` maps each word to its column.  Words whose
    count in row *index* is greater than 4 are kept, and each kept count is
    divided by the largest kept count.  An empty dict is returned when no
    word qualifies.  *text* and *file_name* are accepted for the caller's
    convenience but are not used.
    """
    row = counts[index]
    # Iterating vocabulary_ gives every word with its column directly and
    # does not rely on get_feature_names().
    frequent = {
        w: row[col]
        for w, col in vectorizer.vocabulary_.items()
        if row[col] > 4
    }
    if not frequent:
        # max() of an empty sequence would raise ValueError.
        return {}
    normalizing_factor = max(frequent.values())
    return {w: c / normalizing_factor for w, c in frequent.items()}
# Entry point: loads the texts, fits a CountVectorizer over them, then prints
# the result of process_text for the files.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the two loops below cannot be determined from the text alone.
def main():
# Each item of data is a (file_name, text) pair yielded by get_raw_data.
data = list(get_raw_data())
print('Data loaded')
n = len(data)
vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n,tokenizer=StemTokenizer())
# Only the text half of each pair is passed to the vectorizer.
counts = vectorizer.fit_transform(text for p, text in data).toarray()
print('Vectorization done.')
print (counts)
for x in range(95):
file_name = str(x+1)+".txt"
# print (text)
# Here `text` is bound to the whole (file_name, text) pair from data; it is
# replaced below by the contents read from file_name.
for i, (text) in enumerate(data):
print (file_name)
# print (text)
with codecs.open(file_name,"rU","utf-8") as myfile:
text = myfile.read()
result = process_text(counts, vectorizer, text, file_name, i)
print (result)
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
main()
Looks like you've got a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find words that occur in one file only. I'd just gather all words, counting how many files each one occurs in; and print out the singletons.
from collections import Counter
import re
fileids = [str(n + 1) + ".txt" for n in range(95)]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:  # Add encoding if really needed
        text = fp.read().lower()
    # Split on runs of non-word characters.
    words = set(re.split(r"\W+", text))
    # re.split leaves an empty string when the text starts or ends with a
    # non-word character; it is not a word.
    words.discard("")
    # Updating with the *set* of words adds 1 per file, so filecounts holds
    # the number of files each word occurs in, not the number of occurrences.
    filecounts.update(words)
# Words that occur in exactly one file.
singletons = [word for word in filecounts if filecounts[word] == 1]
print(" ".join(singletons))
Done. You don't need scikit, you don't need the nltk, you don't need a pile of IR algorithms. You can use the list of singletons in an IR algorithm, but that's a different story.
def parseText():
    """Append every occurrence of ``myWord`` found in ``oFile`` to ``aWords``.

    Uses three module-level names: ``oFile`` (an open text file), ``myWord``
    (the word to look for) and ``aWords`` (the list that receives one entry
    per match).  Words are compared exactly, after splitting each line on
    whitespace.
    """
    for sLine in oFile:
        # Iterate the words of this line; the earlier version looped over
        # the list of whole lines here, so single words were never compared.
        for sWord in sLine.split():
            if sWord == myWord:
                aWords.append(sWord)
# Python 2 driver (raw_input, print statement) for parseText().
# NOTE(review): parseText() reads a module-level `oFile`, which the visible
# code never assigns; it has to be opened before the call below.
# Create empty list to store all instances of the word that we may find
aWords = []
# Prompt user to know what word to search
myWord = str( raw_input( 'what word to searh:' ) )
# Call function
parseText()
# Check if list has at least one element
if len( aWords ) < 1: print 'Word not found in file'
else: print str( len( aWords ) ) + ' instances of our word found in file'