Memory Error while running python script on 4GB file - python

I am trying to count the number of words that have a length between 1 and 5. The file size is around 4GB and I am getting a memory error.
# Asker's script as posted; the indentation appears to have been lost when
# the page was extracted, so the nesting below is not shown.
import os
files = os.listdir('C:/Users/rram/Desktop/')
for file_name in files:
file_path = "C:/Users/rram/Desktop/"+file_name
f = open (file_path, 'r')
# readlines() builds a list holding every line of the file at once; this is
# the statement the asker reports the MemoryError on.
text = f.readlines()
update_text = ''
wordcount = {}
for line in text:
arr = line.split("|")
# Index 13 is the 14th "|"-separated field of the line.
word = arr[13]
# Only words of length 1 to 5 are counted.
if 1<=len(word)<6:
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
# update_text accumulates the re-joined fields of the lines in one string.
update_text+= '|'.join(arr)
print (wordcount) #print update_text
print 'closing', file_path, '\t', 'total files' , '\n\n'
f.close()
In the end I get a MemoryError on this line: text = f.readlines()
Can you please help me optimize it?

As suggested in the comments you should read the file line by line and not the entire file.
For example :
# Count the whitespace-separated words of 1 to 5 characters in words.txt.
# Iterating over the file object yields one line at a time, so the whole
# file is never held in memory at once.
count = 0
with open('words.txt', 'r') as f:
    for line in f:
        for word in line.split():
            if 1 <= len(word) <= 5:
                count += 1
print(count)
EDIT :
If you only want to count the words in 14-th column and split by | instead then :
# Count the lines of words.txt whose 14th "|"-separated field (index 13)
# is 1 to 5 characters long, reading the file one line at a time.
count = 0
with open('words.txt', 'r') as f:
    for line in f:
        # enumerate() supplies the field index, so a line with fewer than
        # 14 fields simply never reaches index 13.
        for index, word in enumerate(line.split("|")):
            if index == 13 and 1 <= len(word) <= 5:
                count += 1
print(count)
Note that you should avoid writing this:
arr = line.split("|")
word = arr[13]
since the line may contain fewer than 14 fields, in which case arr[13] raises an IndexError.

Related

Finding longest word in a txt file

I am trying to create a function in which a filename is taken as a parameter and the function returns the longest word in the file with the line number attached to the front of it.
This is what I have so far but it is not producing the expected output I need.
# Asker's attempt as posted; the indentation appears to have been lost when
# the page was extracted, so the nesting of the statements is not shown.
def word_finder(file_name):
with open(file_name) as f:
# readlines() loads all lines of the file into a list.
lines = f.readlines()
line_num = 0
longest_word = None
for line in lines:
line = line.strip()
# A line that is empty after stripping makes the function return None.
if len(line) == 0:
return None
else:
line_num += 1
tokens = line.split()
for token in tokens:
# Keep the first token seen, then any strictly longer one.
if longest_word is None or len(token) > len(longest_word):
longest_word = token
# line_num is the running line counter, not a line number stored together
# with longest_word.
return (str(line_num) + ": " + str(longest_word))
I think this is the shortest way to find the word; correct me if not.
def wordFinder(filename):
    """Print the longest whitespace-separated word in the file *filename*.

    The whole file is read into memory. When several words share the
    maximum length, the first one is printed. Nothing is printed when the
    file contains no words.
    """
    with open(filename, "r") as f:
        words = f.read().split()  # split() returns a list with the words in the file
    if not words:
        # max() raises ValueError on an empty sequence, so stop here.
        return
    longestWord = max(words, key=len)  # key=len compares the words by their length
    print(longestWord)  # prints the longest word
Issue
Exactly what ewong diagnosed:
the last return statement is indented too deeply
Currently:
the longest word in the first line only
Solution
Should be aligned with the loop's column, to be executed after the loop.
def word_finder(file_name):
    """Return the longest word in *file_name*, prefixed with its line number.

    The result has the form "<line number>: <word>", where the 1-based line
    number is that of the line containing the word. The first word wins on
    ties. Returns None when the file contains no words.
    """
    longest_word = None
    longest_line_num = 0
    with open(file_name) as f:
        for line_num, line in enumerate(f, start=1):
            # Blank lines yield no tokens, so they are skipped without
            # ending the search.
            for token in line.split():
                if longest_word is None or len(token) > len(longest_word):
                    longest_word = token
                    # remember where the word was found, not just the word
                    longest_line_num = line_num
            # a return here would exit the loop too early, after the 1st line
    # loop ended
    if longest_word is None:
        return None
    return str(longest_line_num) + ": " + str(longest_word)
Then:
the longest word in the file with the line number attached to the front of it.
Improved
def word_finder(file_name):
    """Return the longest word in *file_name* together with its line number.

    The file is read line by line. The result has the form
    "<line number>: '<word>' (<length> chars)" with a 1-based line number.
    The first word wins on ties. Returns "No longest word found!" when the
    file contains no words.
    """
    line_word_longest = None  # global max: tuple of (line number, longest word)
    with open(file_name) as f:
        for i, line in enumerate(f, start=1):  # 1-based line number and line content
            tokens = line.split()
            if tokens:  # blank lines have no words to compare
                # Passing the list directly avoids the unparenthesized
                # generator expression, which is a SyntaxError before key=.
                max_token = max(tokens, key=len)
                if line_word_longest is None or len(max_token) > len(line_word_longest[1]):
                    line_word_longest = (i, max_token)
    # loop ended
    if line_word_longest is None:
        return "No longest word found!"
    return f"{line_word_longest[0]}: '{line_word_longest[1]}' ({len(line_word_longest[1])} chars)"
See also:
Basic python file-io variables with enumerate
List Comprehensions in Python to compute minimum and maximum values of a list
Some SO research for similar questions:
inspiration from all languages: longest word in file
only python: [python] longest word in file
non python: -[python] longest word in file

Open file and count how many times a word is in file

I want to open a file with a given word. The function will read the file line by line and return a count of how many lines contain the given word.
# Asker's attempt as posted; the indentation appears to have been lost when
# the page was extracted, so the nesting of the statements is not shown.
def count_word(file_name, word):
with open(file_name, 'r') as file:
# readline() consumes the first line before the loop below starts, so the
# loop does not see that line.
line = file.readline()
# rstrip() returns a new string; its result is discarded here.
line.rstrip('\n')
cnt = 0
for line in file:
# Substring test: true whenever `word` occurs anywhere in the line.
if word in line:
cnt += 1
return cnt
This is what I've tried, but it's not working correctly. Not sure what's going on.
Try this:
def count_word(file_name, word):
    """Return the number of non-overlapping occurrences of *word* in the file.

    The whole file is read into memory and searched with str.count(), so
    matches inside longer words are counted as well (e.g. "as" in
    "classic"). This is a count of occurrences, not of lines.
    """
    with open(file_name, 'r') as file:
        content = file.read()
    return content.count(word)
You need to count the occurrences of the word in isolation. For example, as is in classic, but the word as does not appear in the sentence this is a classic problem. Additionally, you need to move your return to outside the for-loop:
def wordCount(infilepath, word):
    """Return how often *word* occurs as a whole token in the file.

    Each line is split on whitespace and only tokens equal to *word* are
    counted, so "as" is not counted inside "classic".
    """
    answer = 0
    with open(infilepath) as infile:
        # Iterate over the lines of the open file, not over the characters
        # of the path string.
        for line in infile:
            answer += line.split().count(word)
    return answer
Here is an alternative version using collections.Counter and re.split:
from collections import Counter
import re
def count_word(file_name, word):
    """Return how many times *word* appears among the tokens of the file.

    The file content is split on runs of whitespace or runs of non-word
    characters, and the resulting tokens are tallied with a Counter.
    """
    # The with-block closes the file once it has been read.
    with open(file_name) as file:
        # Raw string: \s and \W are regex escapes, not string escapes.
        tokens = re.split(r'\s+|\W+', file.read())
    return Counter(tokens).get(word, 0)
This should do it: it opens the file, goes through it line by line, counts how many times the word appears in each line, and returns the total.
# function to count words
def count_word(file_name, word):
    """Return how often *word* occurs in the file, ignoring case.

    Each line is lower-cased and split on whitespace, and whole tokens
    equal to the lower-cased *word* are counted.
    """
    # Lower-case the word once so it is compared the same way as the
    # lower-cased lines; otherwise "Test" could never match.
    target = word.lower()
    # holds the number of matches
    cnt = 0
    # open the file and loop through it one line at a time
    with open(file_name, 'r') as file:
        for line in file:
            # split() already discards the trailing newline, so no
            # separate rstrip() is needed
            cnt += line.lower().split().count(target)
    return cnt
print(count_word("test.txt", "test"))

Reading lines with a `for` loop and rewriting them creates more rows by splitting them at commas

I re-upload question after some editing.
This file consists of one column and 81,021 rows.
What I am trying to do is read each row then rewrite files.
After reading each row, I want to count the number of letters, the number of special characters, and the number of white spaces in each row.
First, here is my code to read, count number of letters, and rewrite.
# Asker's first attempt as posted; the indentation appears to have been lost
# when the page was extracted. `csv` and `count_letters` are used below but
# are neither imported nor defined in this snippet.
file = "C:/" # File I want to read
with open("C:/",'w',encoding='cp949',newline='') as testfile: # New file
csv_writer=csv.writer(testfile)
with open(file,'r') as fi:
for each in fi:
# `file` is rebound here from the input path to the current line.
file=each
linecount=count_letters(file)
# One output row per input line: the line text followed by its count.
lst=[file]+[linecount]
csv_writer.writerow(lst)
The problem here is that number of rows increased from 81021 to 86000. Records that have , were separated into multiple rows. Here's how I edited.
# Asker's second attempt as posted; the indentation appears to have been
# lost when the page was extracted, so the nesting is not shown.
input_fileName = ""
output_fileName = ""
f = open(input_fileName, 'r')
out_list = []
buf = ''
flg = 0
for line in f:
# A line with an odd number of double quotes toggles flg between 0 and 1.
if line.count('"')%2 == 1:
if flg == 0: flg = 1
else: flg = 0
# While flg is 1, stripped lines are accumulated in buf.
if flg == 1: buf += line.strip(' \n')
elif flg == 0 and len(buf) > 0:
# flg is back to 0 with text pending: finish the record and store it
# together with its length.
buf += line.strip(' \n')
buf = buf.strip(' "')
out_list.append([buf,len(buf)])
buf = ''
else:
line = line.strip(' \n')
out_list.append([line,len(line)])
f.close()
of = open(output_fileName, 'w')
for each in out_list:
# Text and length are joined with a bare ',' and no quoting, so a comma
# inside the text cannot be told apart from the separator in the output.
print(each[0]+','+str(each[1]), file=of)
of.close()
In this case, the number of rows is not changed.
But now those files creates more columns and records are now separated into multiple columns instead of rows.
How should I fix this problem?
I can't delete , in my file since some rows have ,
That one where it says Nationality caused an error. There were both Korean and English written in one cell. There was a line between those two words.
국적
Nationality
성별
합계
And now it turned into 4 rows and there are quotation marks.
"국적
Nationality"
성별
합계

How can I count the number of times each word appears in a txt file?

Here is the assignment in detail:
Write a complete python program which reads words from a file called trash.dat. Assume there is one word per line in the file. Output the count for each of the different words (case insensitive) in the file. For example a file with:
dog
Dog
cat
rat
Would output:
dog=2
cat=1
rat=1
You should go off and do your own homework to help you learn. But regardless, here is a solution.
#!/usr/bin/env python
# Count how often each word occurs in trash.dat (one word per line),
# ignoring case, and print one "word=count" line per distinct word.
counts = {}
with open("trash.dat", "r") as f:
    for line in f:
        # Strip the line ending so "dog\n" and a final "dog" without a
        # newline share one key; lines left empty are skipped.
        word = line.strip().lower()
        if word:
            counts[word] = counts.get(word, 0) + 1
for word in counts:
    # A single formatted argument prints the same on Python 2 and 3.
    print("%s=%d" % (word, counts[word]))
#!python2
# Count the words of trash.dat (one per line, case-insensitive) with
# collections.Counter and print one message per distinct word.
from collections import Counter
words = []
# load words to list
with open('trash.dat', 'r') as fp:
    for line in fp:
        if line != '\n':
            words.append(line.lower().rstrip())
# make a dictionary from the 'words' list
cnt = Counter(words)
# print out the key, value pairs
for k, v in cnt.items():
    # parenthesized single argument: prints the same on Python 2 and 3
    print('the count of ' + k + ' is: ' + str(v))
'''
# output
the count of rat is: 1
the count of dog is: 2
the count of cat is: 1
'''
This might help:
# Read fileName.dat (one word per line) into a list of lower-cased words,
# then print how many times "rat", "dog" and "cat" occur in it.
theWords = []
with open('fileName.dat', 'r') as file:
    for line in file:
        theWords.append(line.lower().rstrip("\n"))
# list.count() returns the number of entries equal to the given word
print("Rat = " + str(theWords.count("rat")))
print("Dog = " + str(theWords.count("dog")))
print("Cat = " + str(theWords.count("cat")))
Every line in your file will be stored as a separate list element.

How to open a file and find the longest length of a line and then print it out

Here's is what I have done so far but the length function isn't working.
# Asker's attempt as posted (Python 2 print statements); the indentation
# appears to have been lost when the page was extracted.
import string
def main():
print " This program reads from a file and then prints out the"
print " line with the longest length the line ,or with the highest sum"
print " of ASCII values , or the line with the greatest number of words"
infile = open("30075165.txt","r")
# Echo every line of the input file.
for line in infile:
print line
infile.close()
def length():
maxlength = 0
infile = open("30075165.txt","r")
for line in infile:
# lengthofline is not defined anywhere in this snippet.
linelength = lengthofline
if linelength > maxlength:
#If linelength is greater than maxlength value the new value is linelength
maxlength = linelength
linelength = line
# maxlinetext is never assigned in this snippet, and `print ,` is not a
# valid print statement.
print ,maxlinetext
infile.close()
For Python 2.5 to 2.7.12
print max(open(your_filename, 'r'), key=len)
For Python 3 and up
print(max(open(your_filename, 'r'), key=len))
# Find the longest line of the file, reading it one line at a time.
# The first line wins when several lines share the maximum length.
large_line = ''
large_line_len = 0
filename = r"C:\tmp\TestFile.txt"
with open(filename, 'r') as f:
    for line in f:
        if len(line) > large_line_len:
            large_line_len = len(line)
            large_line = line
# parenthesized single argument: prints the same on Python 2 and 3
print(large_line)
output:
This Should Be Largest Line
And as a function:
def get_longest_line(filename):
    """Return the longest line of *filename*, including its line ending.

    The file is read line by line. The first line wins when several lines
    share the maximum length; an empty file yields ''.
    """
    large_line = ''
    with open(filename, 'r') as f:
        for line in f:
            # strict comparison keeps the earliest of equally long lines
            if len(line) > len(large_line):
                large_line = line
    return large_line
print get_longest_line(r"C:\tmp\TestFile.txt")
Here is another way, you would need to wrap this in a try/catch for various problems (empty file, etc).
def get_longest_line(filename):
    """Return the longest line of *filename*, including its line ending.

    Lines are stored in a dict keyed by their length, so a later line
    replaces an earlier one of the same length: the last line wins on ties.

    Raises:
        IndexError: if the file contains no lines.
    """
    mydict = {}
    # The with-block closes the file even if reading fails.
    with open(filename, 'r') as infile:
        for line in infile:
            mydict[len(line)] = line
    if not mydict:
        raise IndexError("file contains no lines")
    return mydict[max(mydict)]
You also need to decide what happens when two 'winning' lines have equal length: pick the first or the last? The former function will return the first, the latter will return the last.
File contains
Small Line
Small Line
Another Small Line
This Should Be Largest Line
Small Line
Update
The comment in your original post:
print " This program reads from a file and then prints out the"
print " line with the longest length the line ,or with the highest sum"
print " of ASCII values , or the line with the greatest number of words"
Makes me think you are going to scan the file for length of lines, then for ascii sum, then
for number of words. It would probably be better to read the file once and then extract what data you need from the findings.
def get_file_data(filename):
    """Return per-line statistics for *filename* in a single pass.

    Returns a list with one tuple per line:
    (line, line_length, line_ascii_sum, line_word_count). The line keeps
    its line ending, which is included in the length and in the sum.
    """
    def ascii_sum(line):
        # sum of the code points of all characters in the line
        return sum(ord(x) for x in line)

    def word_count(line):
        # split() with no separator splits on runs of whitespace
        return len(line.split())

    # The with-block closes the file once all lines have been processed.
    with open(filename, 'r') as infile:
        filedata = [(line, len(line), ascii_sum(line), word_count(line))
                    for line in infile]
    return filedata
This function will return a list of each line of the file in the format: line, line_length, line_ascii_sum, line_word_count
This can be used as so:
afile = r"C:\Tmp\TestFile.txt"
for line, line_len, ascii_sum, word_count in get_file_data(afile):
print 'Line: %s, Len: %d, Sum: %d, WordCount: %d' % (
line.strip(), line_len, ascii_sum, word_count)
to output:
Line: Small Line, Len: 11, Sum: 939, WordCount: 2
Line: Small Line, Len: 11, Sum: 939, WordCount: 2
Line: Another Small Line, Len: 19, Sum: 1692, WordCount: 3
Line: This Should Be Largest Line, Len: 28, Sum: 2450, WordCount: 5
Line: Small Line, Len: 11, Sum: 939, WordCount: 2
You can mix this with Steef's solution like so:
>>> afile = r"C:\Tmp\TestFile.txt"
>>> file_data = get_file_data(afile)
>>> max(file_data, key=lambda line: line[1]) # Longest Line
('This Should Be Largest Line\n', 28, 2450, 5)
>>> max(file_data, key=lambda line: line[2]) # Largest ASCII sum
('This Should Be Largest Line\n', 28, 2450, 5)
>>> max(file_data, key=lambda line: line[3]) # Most Words
('This Should Be Largest Line\n', 28, 2450, 5)
Try this:
def main():
    """Print the program banner, then print the longest line of the input file."""
    # Single parenthesized arguments print the same on Python 2 and 3.
    print(" This program reads from a file and then prints out the")
    print(" line with the longest length the line ,or with the highest sum")
    print(" of ASCII values , or the line with the greatest number of words")
    length()


def length():
    """Print the longest line of 30075165.txt.

    The first line wins when several lines share the maximum length; the
    printed text keeps the line's own line ending.
    """
    maxlength = 0
    maxlinetext = ""
    # The with-block closes the file even if reading fails.
    with open("30075165.txt", "r") as infile:
        for line in infile:
            linelength = len(line)
            if linelength > maxlength:
                # a longer line was found: remember its length and its text
                maxlength = linelength
                maxlinetext = line
    print(maxlinetext)
EDIT: Added main() function.
linelength = lengthofline # bug?
It should be:
linelength = len(line) # fix
Python might not be the right tool for this job.
$ awk 'length() > n { n = length(); x = $0 } END { print x }' 30075165.txt
My solution (also works in Python 2.5):
import os.path


def getLongestLineFromFile(fileName):
    """Return the longest line of *fileName*, including its line ending.

    The first line wins when several lines share the maximum length; an
    empty file yields "".

    Raises:
        IOError: if *fileName* does not exist.
    """
    if not os.path.exists(fileName):
        # Only exception instances can be raised; a plain string cannot.
        raise IOError("File not found")
    longestLine = ""
    file = open(fileName, "r")
    # try/finally rather than `with`, so the snippet keeps working on
    # Python 2.5 without a __future__ import.
    try:
        for line in file:
            if len(line) > len(longestLine):
                longestLine = line
    finally:
        file.close()
    return longestLine


if __name__ == "__main__":
    print(getLongestLineFromFile("input.data"))
Example "input.data" contents:
111111111
1111111111111111111111
111111111
22222222222222222
4444444444444444444444444444444
444444444444444
5555

Categories

Resources