Read only the numbers from a txt file python - python

I have a text file that contains some words and a number written with a decimal point in it. For example
hello!
54.123
Now I only want the number 54.123 to be extracted and converted so that the outcome is 54123
The code I tried is
import re

# Matches lines that start with an optional '+' followed by a digit.
exp = re.compile(r'^[\+]?[0-9]')
my_list = []
with open('file.txt') as f:
    lines = f.readlines()
    for line in lines:
        if re.match(exp, line.strip()):
            # int() rejects text containing a decimal point such as '54.123';
            # this is the call that raises the ValueError quoted below.
            my_list.append(int(line.strip()))
#convert to a string
listToStr = ' '.join([str(elem) for elem in my_list])
print(listToStr)
But this returns the error: ValueError: invalid literal for int() with base 10: '54.123'
Does anyone know a solution for this?

You can try to convert the current line to a float. In case the line does not contain a legit float number it returns a ValueError exception that you can catch and just pass. If no exception is thrown just split the line at the dot, join the 2 parts, convert to int and add to the array.
my_list = []
with open('file.txt') as f:
    for line in f:
        text = line.strip()
        try:
            # float() is used only as a validity check: it raises ValueError
            # for lines such as 'hello!' that are not numbers.
            float(text)
            # Split at the dot and re-join, so '54.123' becomes 54123.
            num = int(''.join(text.split(".")))
        except ValueError:
            # Not a plain decimal number; skip this line.
            continue
        my_list.append(num)
#convert to a string
listToStr = ' '.join([str(elem) for elem in my_list])
print(listToStr)

You can check if a given line is a string representing a number using the isdigit() function.
From what I can tell you need to just check if there is a number as isdigit() works on integers only (floats contain "." which isn't a number and it returns False).
For example:
def numCheck(string):
    """Return True if at least one character of ``string`` is a digit.

    An empty string contains no digits and therefore yields False.
    """
    return any(i.isdigit() for i in string)


string = '54.123'
print(numCheck(string))  # True
string = 'hello'
print(numCheck(string))  # False
Note: if your data contains things like 123ab56 then this won't be good for you.
To convert 54.123 to 54123 you could use the replace(old, new) function.
For example:
# The value must be a str: a float such as 54.123 has no replace() method.
string = '54.123'
new_string = string.replace('.', '')  # replace . with nothing
print(new_string)  # 54123

This may help; I am now getting the numbers from the file. I guess you meant to use split rather than strip.
import re

# Matches any token whose first character is a digit.
exp = re.compile(r'[0-9]')
my_list = []
with open('file.txt') as f:
    for line in f:
        # split() breaks the line into whitespace-separated tokens.
        for numbers in line.split():
            if exp.match(numbers):
                my_list.append(numbers)
#convert to a string
listToStr = ' '.join([str(elem) for elem in my_list])
print(listToStr)

Related

How do I fix this ValueError?

I am trying to get an average from a text file that uses a def function. I am trying to convert the list from the text file to int(). Instead of converting it gives me the error: " ValueError: invalid literal for int() with base 10: '5, 5, 6, 7' ". The "5, 5, 6, 7" is one that I made from the proper .txt file. Here is the code:
def getNumberList(filename):
    """Return the lines of ``filename`` converted to integers."""
    with open(filename,'r') as f:
        lyst = f.read().split('\n')
        # Each whole line is passed to int(); a line such as '5, 5, 6, 7'
        # is what triggers the ValueError described in the question.
        numberList = [int(num) for num in lyst]
    return numberList


def getAverage(filename, func):
    """Return the mean of the numbers that ``func(filename)`` produces."""
    numbers = func(filename)
    return sum(numbers)/len(numbers)


def main():
    """Ask for a file name and print the average of its numbers."""
    filename = input("Input the file name: ")
    average = getAverage(filename, getNumberList)
    print(average)


if __name__ == "__main__":
    main()
You are splitting by line but you are not splitting by commas, so you are trying to convert 5,5,6,7 to an integer, which is impossible. You need to also split by commas after you split by line, and then combine them into one list, if you want to average all the numbers in the file. The following should work:
def getNumberList(filename):
    """Return every comma-separated integer in ``filename`` as one flat list.

    Lines are read in order and each one is split on commas. Blank lines and
    empty fields (for example after a trailing comma) are skipped.

    Raises:
        ValueError: if a non-empty field is not a valid integer.
    """
    with open(filename,'r') as f:
        lines = f.readlines()
    # In a nested comprehension the outer loop comes first: ``line`` has to be
    # bound before ``line.split(',')`` can be evaluated.
    numberList = [
        int(num)
        for line in lines
        for num in line.split(',')
        if num.strip()
    ]
    return numberList
It looks like you need to split each element of lyst on "," because right now the code tries to convert each whole line, such as "1,2,3", to an integer.
So, change this and try.
def getNumberList(filename):
    """Return every comma-separated integer in ``filename`` as one flat list.

    The file content is split into lines and each line is split on commas.
    Empty fields (from an empty file, a blank line or a trailing comma) are
    skipped instead of being passed to int().

    Raises:
        ValueError: if a non-empty field is not a valid integer.
    """
    with open(filename,'r') as f:
        lyst = []
        temp = f.read().strip().split('\n')
        for i in temp:
            lyst += i.strip().split(',')
    # int() tolerates surrounding whitespace but not an empty string.
    numberList = [int(num) for num in lyst if num.strip()]
    return numberList

how to check integer and string in a list

My list is ['1','2','to','3']
I need to write a logic that
convert '1' '2' which is string to 1, 2 which is integer
print an error msg since 'to' string is included and cannot be converted to integer
Here's the code I have right now:
def average_file(filename):
    """Return the average of the all-digit lines of ``filename``.

    Lines that are not made up solely of digits are dropped without any
    message. Raises ZeroDivisionError when no line qualifies.
    """
    inputFile = open(filename, "r")
    inList = []
    results = []
    n = []
    for line in inputFile:
        inList.append(line.strip()) #inList.append(line)
    n = [int(elem) for elem in inList if elem.isdigit()] #I only remove the string and leave integer in my list, but I need a check logic to print error msg
    results = list(map(int, n))
    inputFile.close()
    results = sum(results)/len(results)
    return results
Few things:
The pythonic way to do it is to expect it to be an all digit value and handle the error when it is not.
You can use with to handle your file lifetime.
You can calculate sum and count of elements during the reading without saving additional array (and therefore know the average).
strip is redundant when parsing to int like that int(variable):
There you go:
def average_file(filename):
    """Return the average of the integers in ``filename``, one per line.

    Prints a message and returns None as soon as a line cannot be parsed as
    an integer. Also returns None when the file contains no lines.
    """
    summary = 0
    count = 0
    with open(filename, "r") as inputFile:
        for line in inputFile:
            try:
                # int() ignores surrounding whitespace, so no strip() is needed.
                value = int(line)
            except ValueError:
                print('Can not parse "{0}" to a number'.format(line))
                # If reached here one of the values in the file is not a number and None is returned immediately
                return None
            summary += value
            count += 1
    # If count is 0 return None, otherwise return the average
    return (summary / count) if count else None
The answer was edited after some clarifications from OP:
Immediately return None when one of the values is not a number.
convert '1' '2' which is string to 1, 2 which is integer print an
error msg since 'to' string is included and cannot be converted to
integer
source = ['1', '2', 'to', '3']
result = []
for item in source:
    try:
        result.append(int(item))
    except ValueError:
        # int() raised, so the item is reported instead of being stored.
        print('Not integer: {}'.format(item))
print(result)
Attempt to convert each item to the list of results. If the conversion fails, print an error message.
l = ['1','2','to','3']
result = []
for item in l:
    try:
        result.append(int(item))
    except ValueError:
        # Items that int() cannot convert are printed rather than stored.
        print(item)
You can use a try/except block to separate the valid integer literals from everything else:
candidates = ['1','2','to','3']
for candidate in candidates:
    try:  # attempt the conversion
        value = int(candidate)
    except ValueError:  # conversion failed!
        print(candidate, 'is not an integer')
    else:  # conversion succeeded
        print(candidate, 'is the integer', value)
In your case, you can just collect the values in the else clause:
results = []
# The file object must be iterated under the same name it was opened with.
with open(filename) as input_file:
    for line in input_file:
        try:
            value = int(line.strip())
        except ValueError:
            print(line.strip(), 'is not an integer')
        else:
            # Only reached when int() succeeded.
            results.append(value)
l = ['1', '2', 'word', '4']
You can do:
# Digit-only items are converted with int(); for any other item print() runs
# and its return value, None, is what ends up in the list at that position.
n = [int(i) if i.isdigit() else print('\nNot able to convert: '+i) for i in l]
Output:
Not able to convert: word
n = [1, 2, None, 4]

Python search for patterns in all lines, export only lines with results

I would like to search for strings that match a pattern in a text file and export only the matched strings
k=''
regex = re.compile(r'[a-zA-Z]{2}\d{8}')
with open(file, 'r') as f:
    for line in f:
        line = line.replace(',', '')
        line = line.replace('.', '')
        k = regex.findall(line)
        #k.append(line)
        # findall returns a list, which never equals '', so this test is
        # true for every line and empty results get printed as well.
        if not k=='':
            position=True
        else:
            position=False
        if position==True:
            print(k)
Somehow my code doesn't work, it always returns the following output:
[] [] [] [] [] [] [] ['AI13933231'] [] [] [] [] []
I want the output to contain only the matched strings. Thank you!
The empty lists [] appear because findall is called for every line of the file, including lines that are empty (containing just \n) or that do not match the regex '[a-zA-Z]{2}\d{8}'. Note that regex.findall(line) returns a list, so if the regex does not match anything, the result is an empty list.
Your main error is in this check: if not k=='':. Note that k is a list, so it never equals ''.
Consider this code:
import re

k = []
regex = re.compile(r'[a-zA-Z]{2}\d{8}')
with open("omg.txt", 'r') as f:
    for line in f:
        line = line.replace(',', '')
        line = line.replace('.', '')
        k = regex.findall(line)
        # findall returns a list; an empty list is falsy, so only lines with
        # at least one match are printed.
        position = bool(k)
        if position:
            print(k)
Given the file (Text after # are ignored, not part of the file)
AZ23153133
# Empty line
AB12355342
gz # No match
XY93312344
The output would be
['AZ23153133']
['AB12355342']
['XY93312344']

Python regex string match from file

I have this a text file that resembles
alpha alphabet alphameric
I would like to match just the first string `alpha', nothing else
I have the following code that attempts to match just the alpha string and get its line number
# findall returns a list of matches, not a string.
findWord = re.findall('\\ba\\b', "alpha")
with open(file) as myFile:
    for num, line in enumerate(myFile, 1):
        # A list used as the left operand of `in` against a str is what
        # raises the TypeError quoted below.
        if findWord in line:
            print('Found at line: ', num)
However I get the following error:
TypeError: 'in <string>' requires string as left operand, not list
Issues in your code
re.findall('\\ba\\b', "alpha") returns a list of matches, but `if findWord in line` then uses that list where a string is required. That is the error you are getting.
With findWord = re.findall('\\ba\\b', "alpha") you are searching for the standalone word a inside the string alpha, where it does not occur, so the result is an empty list.
Try this
import re

# The pattern is applied to each line of the file rather than to the literal
# "alpha"; \b restricts the match to the standalone word.
with open("data.txt") as myFile:
    for num, line in enumerate(myFile, 1):
        if re.findall(r'\balpha\b', line):
            print('Found at line: ', num)
You may modify your code a bit
with open(file, 'r') as myFile:
    for num, line in enumerate(myFile, 1):
        # split() yields whole whitespace-separated words, so 'alphabet'
        # does not count as 'alpha'.
        if 'alpha' in line.split():
            print('Found at line', num)
Output:
Found at line 1
You can try this:
import re

s = "alpha alphabet alphameric"
# Raw string so that \s reaches the regex engine unchanged; the lookahead
# only accepts "alpha" when a whitespace character follows it.
data = re.findall(r"alpha(?=\s)", s)[0]
Output:
"alpha"

Python error with lambda and alphanumeric words

I am really new to python and now I am having an error and do not know why I get this error.
I have 3 lists with words. The lists contains words numeric, literal words and alphanumeric words. These lists are saved in an txt file. Each file can contain words from other lists or new words.
Now I like to compare these lists and copy all words without duplicates in to the new list. So I have one big list, containing all words but no duplicates.
This is my script:
file_a = raw_input("File 1?: ")
file_b = raw_input("File 2?: ")
file_c = raw_input("File_3?: ")
file_new = raw_input("Neue Datei: ")
def compare_files():
    with open(file_a, 'r') as a:
        with open(file_b, 'r') as b:
            with open(file_c, 'r') as c:
                with open(file_new, 'w') as new:
                    difference = set(a).symmetric_difference(b).symmetric_difference(c)
                    difference.discard('\n')
                    # int() receives everything before the first space; for a
                    # line such as '12234thl\n' that is the whole line, which
                    # is the ValueError shown in the traceback below.
                    sortiert = sorted(difference, key=lambda item: (int(item.partition(' ')[0])
                                      if item[0].isdigit() else float('inf'), item))
                    for line in sortiert:
                        new.write(line)
k = compare_files()
When I run the script I get the following error message:
Traceback (most recent call last):
File "TestProject1.py", line 19, in <module>
k = compare_files()
File "TestProject1.py", line 13, in compare_files
sortiert = sorted(difference, key=lambda item: (int(item.partition(' ')[0])
File "TestProject1.py", line 14, in <lambda>
if item[0].isdigit() else float('inf'), item))
ValueError: invalid literal for int() with base 10: '12234thl\n'
Anyone an idea or something what is wrong in my script?
Thank you for your help :)
partition on ' ' (or on any other separator, for that matter) will not extract the numeric part of the string unless you know which character immediately follows the number, which is very unlikely.
You can instead use a regular expression to extract the leading numeral part of the string:
import re

p = re.compile(r'^\d+')


def _sort_key(item):
    """Sort lines with a leading number numerically, all other lines after."""
    match = p.match(item)
    # Deciding on the match itself keeps the check and the conversion in
    # agreement; there is no separate isdigit() test that could disagree.
    return (int(match.group(0)) if match else float('inf'), item)


def compare_files():
    """Write the sorted symmetric difference of three files to ``file_new``.

    Reads the module-level names ``file_a``, ``file_b``, ``file_c`` and
    ``file_new``. Returns None.
    """
    with open(file_a, 'r') as a, open(file_b, 'r') as b, \
            open(file_c, 'r') as c, open(file_new, 'w') as new:
        # NOTE(review): a chained symmetric_difference keeps lines found in
        # exactly one or in all three files and drops lines found in exactly
        # two. If the goal is "all words without duplicates", set.union may
        # be what is wanted - confirm with the asker.
        difference = set(a).symmetric_difference(b).symmetric_difference(c)
        difference.discard('\n')
        sortiert = sorted(difference, key=_sort_key)
        for line in sortiert:
            new.write(line)
The pattern '^\d+' matches all digits at the start of the string, and p.match(item).group(0) then returns that number as a string, which can be cast to an integer.

Categories

Resources