How can I go to the next line in a .txt file? - Python

How can I read only the first character of each line, without reading the whole line, using Python?
For example, if I have file like:
apple
pear
watermelon
In each iteration I must store only one (the first) letter of the line.
The result of the program should be ["a", "p", "w"]. I tried to use file.seek(), but how can I move it to the next line?

ti7's answer is great, but if the lines might be too long to hold in memory, you may wish to read char by char so that the whole line is never stored in memory:
from pathlib import Path
from typing import Iterator

NEWLINE_CHAR = {'\n', '\r'}

def first_chars(file_path: Path) -> Iterator[str]:
    with open(file_path) as fh:
        new_line = True
        while c := fh.read(1):
            if c in NEWLINE_CHAR:
                new_line = True
            elif new_line:
                yield c
                new_line = False
Test:
path = Path('/some/path/a.py')
easy_first_chars = [l[0] for l in path.read_text().splitlines() if l]
smart_first_chars = list(first_chars(path))
assert smart_first_chars == easy_first_chars

File-like objects are iterable, so you can use them directly like this:
collection = []
with open("input.txt") as fh:
    for line in fh:  # iterate by-lines over file-like
        try:
            collection.append(line[0])  # get the first char in the line
        except IndexError:  # line has no chars
            pass  # consider other handling
# work with collection
You may also consider enumerate() if you care about which line a particular value was on, or yielding line[0] to form a generator (which may allow a more efficient process if it can halt before reading the entire file):
def my_generator():
    with open("input.txt") as fh:
        for lineno, line in enumerate(fh, 1):  # lines are commonly 1-indexed
            try:
                yield lineno, line[0]  # first char in the line
            except IndexError:  # line has no chars
                pass  # consider other handling

for lineno, first_letter in my_generator():
    pass  # work with lineno and first_letter here and break when done

You can read one letter at a time with file.read(1):
file = open(filepath, "r")
letters = []
# Initialized to '\n' to store the first letter
previous = '\n'
while True:
    # Read only one letter
    letter = file.read(1)
    if letter == '':
        break
    elif previous == '\n':
        # Store the letter that follows a newline '\n'
        letters.append(letter)
    previous = letter
file.close()
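If the goal is just to skip to the next line after grabbing its first character, a minimal alternative sketch is to let readline() consume the remainder of the current line (the skipped characters are still read by readline(), just not kept):
letters = []
with open(filepath) as f:
    while True:
        first = f.read(1)
        if not first:  # end of file
            break
        if first not in ('\n', '\r'):
            letters.append(first)
            f.readline()  # discard the rest of the current line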

Reading specific words from an online source - Python

Here's the content of the text file abc.txt
This is before the start and should be ignored.
So should this
and this
*** START OF SYNTHETIC TEST CASE ***
a ba bac
*** END OF SYNTHETIC TEST CASE ***
This is after the end and should be ignored too.
Have a nice day.
I need to write a function, get_words_from_file(filename), that returns a list of lower case words as shown in the sample case below. The function should only process lines between the start and end marker lines and use the definition of words provided below.
I am provided with the following regular expression that describes what is required. I am not expected to understand how regular expressions work; I just need to know that the call to findall below will return a list of the relevant words from a given line string.
words_on_line = re.findall("[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", line)
- Include all lower-case character sequences, including those that contain a - or ' character and those that end with a ' character.
- Words that end with a - MUST NOT be included.
- The words should be in the same order as they occur in the file.
- There must be no more than 9 CONSTANTS declared.
- Functions must be no longer than 20 statements.
- Functions must not have more than 3 parameters.
Test Code:
filename = "abc.txt"
words2 = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(words2)))
print("Valid word list:")
print("\n".join(words2))
Expected Output:
abc.txt loaded ok.
3 valid words found.
Valid word list:
a
ba
bac
My Code is as follows:
def stripped_lines(lines):
    for line in lines:
        stripped_line = line.rstrip('\n')
        yield stripped_line

def lines_from_file(fname):
    with open(fname, 'rt', encoding='utf8') as flines:
        for line in stripped_lines(flines):
            yield line

def is_marker_line(line, start='***', end='***'):
    '''
    Marker lines start and end with the given strings, which may not
    overlap. (A line containing just '***' is not a valid marker line.)
    '''
    min_len = len(start) + len(end)
    if len(line) < min_len:
        return False
    return line.startswith(start) and line.endswith(end)

def advance_past_next_marker(lines):
    '''
    Advances the given iterator through the first encountered marker
    line, if any.
    '''
    for line in lines:
        if is_marker_line(line):
            break

def lines_before_next_marker(lines):
    '''
    Yields all lines up to but not including the next marker line. If
    no marker line is found, yields no lines.
    '''
    valid_lines = []
    for line in lines:
        if is_marker_line(line):
            break
        valid_lines.append(line)
    else:
        # `for` loop did not break, meaning there was no marker line.
        valid_lines = []
    for content_line in valid_lines:
        yield content_line

def lines_between_markers(lines):
    '''
    Yields the lines between the first two marker lines.
    '''
    # Must use the iterator --- if it's merely an iterable (like a list
    # of strings), the call to lines_before_next_marker will restart
    # from the beginning.
    it = iter(lines)
    advance_past_next_marker(it)
    for line in lines_before_next_marker(it):
        yield line

def words(lines):
    text = '\n'.join(lines).lower().split()
    # Same as before...

def get_words_from_file(fname):
    for word in words(lines_between_markers(lines_from_file(fname))):
        return word

filename = "abc.txt"
words2 = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(words2)))
print("Valid word list:")
print("\n".join(words2))
My Crappy Output
Traceback (most recent call last):
  File "C:/Users/Jill/SQ4.1(2).py", line 67, in <module>
    words2 = get_words_from_file(filename)
  File "C:/Users/Jason/SQ4.1(2).py", line 63, in <module>
    for word in words(lines_between_markers(lines_from_file(fname))):
builtins.TypeError: 'NoneType' object is not iterable
Could you help me with correcting my code? I am at a total loss.

I have changed the original code a bit; try the version below.
def stripped_lines(lines):
    for line in lines:
        stripped_line = line.rstrip('\n')
        yield stripped_line

def lines_from_file(fname):
    with open(fname, 'rt') as flines:
        for line in stripped_lines(flines):
            yield line

def is_marker_line(line, start='***', end='***'):
    '''
    Marker lines start and end with the given strings, which may not
    overlap. (A line containing just '***' is not a valid marker line.)
    '''
    min_len = len(start) + len(end)
    if len(line) < min_len:
        return False
    return line.startswith(start) and line.endswith(end)

def advance_past_next_marker(lines):
    '''
    Advances the given iterator through the first encountered marker
    line, if any.
    '''
    for line in lines:
        if is_marker_line(line):
            break

def lines_before_next_marker(lines):
    '''
    Yields all lines up to but not including the next marker line. If
    no marker line is found, yields no lines.
    '''
    valid_lines = []
    for line in lines:
        if is_marker_line(line):
            break
        valid_lines.append(line)
    else:
        # `for` loop did not break, meaning there was no marker line.
        valid_lines = []
    for content_line in valid_lines:
        yield content_line

def lines_between_markers(lines):
    '''
    Yields the lines between the first two marker lines.
    '''
    # Must use the iterator --- if it's merely an iterable (like a list
    # of strings), the call to lines_before_next_marker will restart
    # from the beginning.
    it = iter(lines)
    advance_past_next_marker(it)
    for line in lines_before_next_marker(it):
        yield line

def words(lines):
    text = '\n'.join(lines).lower().split()
    return text

def get_words_from_file(fname):
    return words(lines_between_markers(lines_from_file(fname)))

filename = "abc.txt"
all_words = get_words_from_file(filename)
print(filename, "loaded ok.")
print("{} valid words found.".format(len(all_words)))
print("Valid word list:")
print("\n".join(all_words))
The output will be as below (the first line prints as a tuple because this run used Python 2's print statement):
('abc.txt', 'loaded ok.')
3 valid words found.
Valid word list:
a
ba
bac
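The assignment also supplies a regular expression for extracting the words; if you need to use it instead of split(), here is a hedged sketch of how words() could be adapted, with the rest of the pipeline above left unchanged:
import re

def words(lines):
    # Apply the assignment's regex to each content line and collect matches in order.
    result = []
    for line in lines:
        result.extend(re.findall("[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+", line.lower()))
    return result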

Why is Python only performing an action on the first iteration of a loop [duplicate]

In the following code, if I use:
for line in fin:
it only executes for 'a'.
But if I use:
wordlist = fin.readlines()
for line in wordlist:
then it executes for 'a' through 'z'.
But readlines() reads the whole file at once, which I don't want.
How can I avoid this?
def avoids():
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    num_words = {}
    fin = open('words.txt')
    for char in alphabet:
        num_words[char] = 0
        for line in fin:
            not_found = True
            word = line.strip()
            if word.lower().find(char.lower()) != -1:
                num_words[char] += 1
    fin.close()
    return num_words
The syntax for line in fin can only be used once: after you do that, you've exhausted the file and you can't read it again unless you "reset the file pointer" with fin.seek(0). Conversely, fin.readlines() gives you a list that you can iterate over again and again.
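For instance, a minimal sketch of the seek(0) route applied to the original function (the membership check is equivalent to the original find() test):
def avoids():
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    num_words = {}
    with open('words.txt') as fin:
        for char in alphabet:
            num_words[char] = 0
            fin.seek(0)  # rewind so the file can be re-read for every letter
            for line in fin:
                if char in line.strip().lower():
                    num_words[char] += 1
    return num_words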
I think a simple refactor with Counter (python2.7+) could save you this headache:
from collections import Counter

with open('file') as fin:
    result = Counter()
    for line in fin:
        result += Counter(set(line.strip().lower()))
This will count the number of words in your file (1 word per line) that contain a particular character, which is what your original code does, I believe ... please correct me if I'm wrong.
You could also do this easily with a defaultdict (python2.5+):
from collections import defaultdict

with open('file') as fin:
    result = defaultdict(int)
    for line in fin:
        chars = set(line.strip().lower())
        for c in chars:
            result[c] += 1
And finally, kicking it old-school -- I don't even know when setdefault was introduced...:
fin = open('file')
result = dict()
for line in fin:
    chars = set(line.strip().lower())
    for c in chars:
        result[c] = result.setdefault(c, 0) + 1
fin.close()
You have three options:
1. Read in the whole file anyway.
2. Seek back to the beginning of the file before attempting to iterate over it again.
3. Rearchitect your code so that it doesn't need to iterate over the file more than once.
Try:
from collections import defaultdict
from itertools import product

def avoids():
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    num_words = defaultdict(int)
    with open('words.txt') as fin:
        words = [x.strip() for x in fin.readlines() if x.strip()]
    for ch, word in product(alphabet, words):
        if ch not in word:
            continue
        num_words[ch] += 1
    return num_words
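For example, a small usage sketch (assuming words.txt is present) that prints the per-letter counts:
counts = avoids()
for ch in sorted(counts):
    print("{}: {}".format(ch, counts[ch]))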

Count suffixes appearing in the word file

I have got this Python program which reads through a wordlist file and, using the endswith() method, checks for the suffix endings given in another file.
The suffixes to check for are saved in the list suffixList[].
The count is kept in suffixCount[].
The following is my code:
fd = open(filename, 'r')
print 'Suffixes: '
x = len(suffixList)
for line in fd:
    for wordp in range(0, x):
        if word.endswith(suffixList[wordp]):
            suffixCount[wordp] = suffixCount[wordp] + 1
for output in range(0, x):
    print "%-6s %10i" % (prefixList[output], prefixCount[output])
fd.close()
The output is this:
Suffixes:
able 0
ible 0
ation 0
The program never gets inside this condition:
if word.endswith(suffixList[wordp]):
You need to strip the newline:
word = ln.rstrip().lower()
The words are coming from a file, so each line ends with a newline character. You are then using endswith(), which always fails because none of your suffixes end with a newline.
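For instance, a minimal sketch of that fix dropped into the original loop, assuming suffixList and suffixCount are already populated as in the question:
with open(filename) as fd:
    print('Suffixes: ')
    for line in fd:
        word = line.rstrip().lower()  # remove the trailing newline before matching
        for i, suffix in enumerate(suffixList):
            if word.endswith(suffix):
                suffixCount[i] += 1
for i, suffix in enumerate(suffixList):
    print("%-6s %10i" % (suffix, suffixCount[i]))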
I would also change the function to return the values you want:
def store_roots(start, end):
    with open("rootsPrefixesSuffixes.txt") as fs:
        lst = [line.split()[0] for line in map(str.strip, fs)
               if '#' not in line and line]
    return lst, dict.fromkeys(lst[start:end], 0)

lst, sfx_dict = store_roots(22, 30)  # List, SuffixList
Then slice from the end and see if the substring is in the dict:
with open('longWordList.txt') as fd:
    print('Suffixes: ')
    # lengths of the longest and shortest suffixes
    mx, mn = len(max(sfx_dict, key=len)), len(min(sfx_dict, key=len))
    for ln in map(str.rstrip, fd):
        suf = ln[-mx:]
        for i in range(mx - 1, mn - 2, -1):
            if suf in sfx_dict:
                sfx_dict[suf] += 1
            suf = suf[-i:]

for k, v in sfx_dict.items():
    print("Suffix = {} Count = {}".format(k, v))
Slicing the end of the string incrementally should be faster than calling endswith() for every suffix, especially if you have numerous suffixes of the same length. At most it does mx - mn + 1 lookups per word; only one substring of a given length can match at a time, so all suffixes of that length are handled with a single slice and dict lookup.
You could use a Counter to count the occurrences of each suffix:
from collections import Counter

with open("rootsPrefixesSuffixes.txt") as fp:
    List = [line.strip() for line in fp if line and '#' not in line]
suffixes = List[22:30]  # ?

with open('longWordList.txt') as fp:
    c = Counter(s for word in fp for s in suffixes if word.rstrip().lower().endswith(s))
print(c)
Note: add .split()[0] if there is more than one word per line and you want to ignore the rest; otherwise this is unnecessary.

Reading a file by word without using split in Python

I have a one-line file that I want to read word by word, i.e., with spaces separating the words. Is there a way to do this without loading the data into memory and using split? The file is too large.
You can read the file char by char and yield a word at each whitespace character. Below is a simple solution for a file with single spaces; you should refine it for more complex cases (tabs, multiple spaces, etc.).
def read_words(filename):
    with open(filename) as f:
        out = ''
        while True:
            c = f.read(1)
            if not c:
                break
            elif c == ' ':
                yield out
                out = ''
            else:
                out += c
Example:
for i in read_words("test"):
    print i
It uses a generator to avoid having to allocate a big chunk of memory.
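As a possible refinement of the sketch above (treating any run of whitespace as a separator and not losing the final word), a hedged variant might look like this:
def read_words_ws(filename):
    # Hypothetical variant: any run of spaces, tabs or newlines ends a word.
    with open(filename) as f:
        out = ''
        while True:
            c = f.read(1)
            if not c:  # end of file
                break
            if c.isspace():
                if out:  # only yield once a word has been built
                    yield out
                    out = ''
            else:
                out += c
        if out:  # last word, if the file doesn't end with whitespace
            yield out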
Try this little function:
def readword(file):
    c = ''
    word = ''
    while c != ' ' and c != '\n':
        word += c
        c = file.read(1)
        if c == '':  # end of file: stop, otherwise this would loop forever
            break
    return word
Then to use it, you can do something like:
f = open('file.ext', 'r')
print(readword(f))
This will read the first word in the file, so if your file is like this:
12 22 word x yy
another word
...
then the output should be 12.
Next time you call this function, it will read the next word, and so on...
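If you want to keep reading words until the file is exhausted, here is a hedged usage sketch (assuming words are separated by single spaces or newlines, so readword() only returns an empty string at end of file):
with open('file.ext', 'r') as f:
    while True:
        word = readword(f)
        if not word:  # empty string: end of file reached
            break
        print(word)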

Compare 2 files in Python

I am trying to compare two files, A and C, in Python and for some reason the double for loop doesn't seem to work properly:
with open(locationA + filenameC, 'r') as fileC, open(locationA + filenameA, 'r') as fileA:
    for lineC in fileC:
        fieldC = lineC.split('#')
        for lineA in fileA:
            fieldA = lineA.split('#')
            print 'UserID Clicks' + fieldC[0]
            print 'UserID Activities' + fieldA[0]
            if (fieldC[0] == fieldA[0]) and (fieldC[2] == fieldA[2]):
                print 'OK'
Here, only the first line of C seems to be compared; for the remaining lines of C, the "A loop" seems to be skipped.
Can anyone help me with this?
Your problem is that once you have iterated over fileA, you need to move the file pointer back to the beginning before you can iterate over it again.
So what you might do is create two lists from the files and iterate over those as many times as you want. For example:
fileC_list = fileC.readlines()
fileA_list = fileA.readlines()

for lineC in fileC_list:
    # do something
    for lineA in fileA_list:
        pass  # do something
The problem with nested loops (from the point of view of your current problem) is precisely that the inner loop runs to completion for each iteration of the outer loop. So instead, set lineA by explicitly requesting the next item from the fileA iterator:
with open(locationA + filenameC, 'r') as fileC, open(locationA + filenameA, 'r') as fileA:
    for lineC in fileC:
        fieldC = lineC.split('#')
        lineA = next(fileA)
        fieldA = lineA.split('#')
        print 'UserID Clicks' + fieldC[0]
        print 'UserID Activities' + fieldA[0]
        if (fieldC[0] == fieldA[0]) and (fieldC[2] == fieldA[2]):
            print 'OK'
This logic will ignore any extra lines from fileA once fileC is exhausted, and if fileC contains more lines than fileA things will also get ugly (next() will raise StopIteration) without special checks.
A different approach might use itertools.izip() to collect lines from each file in pairs:
import itertools

with open(locationA + filenameC, 'r') as fileC, open(locationA + filenameA, 'r') as fileA:
    for lineC, lineA in itertools.izip(fileC, fileA):
        fieldC = lineC.split('#')
        fieldA = lineA.split('#')
        print 'UserID Clicks' + fieldC[0]
        print 'UserID Activities' + fieldA[0]
        if (fieldC[0] == fieldA[0]) and (fieldC[2] == fieldA[2]):
            print 'OK'
I can't think of any specific reason to use one instead of the other, but if the files are of any size at all, resist the temptation to use the builtin zip() function instead of itertools.izip() - the former returns a list, so memory usage depends on the file sizes, whereas the latter is a generator and creates values only as they are required. (In Python 3 there is no izip; the built-in zip() is already lazy.)
You are comparing all lines from fileA to each line from fileC. That means that for each line of fileC you would read the entire fileA, and (provided you moved the pointer back to the beginning of fileA) you would read it again and again.
It is easier to read both files at the same time, while they both still have lines:
- if the lines are the same, do something, then read from both files;
- if they are different, read from the "smaller" one (line A < line C: read from file A only; line C < line A: read from file C only);
and finish with two last loops over whatever lines remain (two loops, one for each file, as you do not know which one ran out of lines first). A sketch of this approach follows below.
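A hedged sketch of that merge-style walk, assuming both files are sorted on the first '#'-separated field and reusing the names from the question:
with open(locationA + filenameC) as fileC, open(locationA + filenameA) as fileA:
    lineC, lineA = fileC.readline(), fileA.readline()
    while lineC and lineA:
        fieldC, fieldA = lineC.split('#'), lineA.split('#')
        if fieldC[0] == fieldA[0]:
            if fieldC[2] == fieldA[2]:
                print('OK')
            lineC, lineA = fileC.readline(), fileA.readline()
        elif fieldA[0] < fieldC[0]:
            lineA = fileA.readline()  # advance the file that is behind
        else:
            lineC = fileC.readline()
    # any lines left over in either file simply have no match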
I know this is an old thread, but it comes up on Google when someone is looking for a way to compare 2 text files in Python.
This code worked for me.
You can update the code to use "with open" instead and fine-tune it as you like, but it does the job.
# Ask the user to enter the names of the files to compare
fname1 = input("Enter the first filename (text1.txt): ")
fname2 = input("Enter the second filename (text1.txt): ")

# Open the files for reading in text mode (default mode)
f1 = open(fname1)
f2 = open(fname2)

# Print confirmation
print("-----------------------------------")
print("Comparing files ", " > " + fname1, " < " + fname2, sep='\n')
print("-----------------------------------")

# Read the first line from each file
f1_line = f1.readline()
f2_line = f2.readline()

# Initialize counter for line number
line_no = 1

# Loop while either file1 or file2 has not reached EOF
while f1_line != '' or f2_line != '':

    # Strip the trailing whitespace
    f1_line = f1_line.rstrip()
    f2_line = f2_line.rstrip()

    # Compare the lines from both files
    if f1_line != f2_line:

        # If a line does not exist in file2 then mark the output with a + sign
        if f2_line == '' and f1_line != '':
            print(">+", "Line-%d" % line_no, f1_line)
        # otherwise output the line from file1 and mark it with a > sign
        elif f1_line != '':
            print(">", "Line-%d" % line_no, f1_line)

        # If a line does not exist in file1 then mark the output with a + sign
        if f1_line == '' and f2_line != '':
            print("<+", "Line-%d" % line_no, f2_line)
        # otherwise output the line from file2 and mark it with a < sign
        elif f2_line != '':
            print("<", "Line-%d" % line_no, f2_line)

        # Print a blank line
        print()

    # Read the next line from each file
    f1_line = f1.readline()
    f2_line = f2.readline()

    # Increment the line counter
    line_no += 1

# Close the files
f1.close()
f2.close()
