python text reading - python

datafile = open("temp.txt", "r")
record = datafile.readline()
while record != '':
d1 = datafile.strip("\n").split(",")
print d1[0],float (d1[1])
record = datafile.readline()
datafile.close()
The temp file contains
a,12.7
b,13.7
c,18.12
I can't get output. Please help.

The correct code should be:
with open('temp.txt') as f:
for line in f:
after_split = line.strip("\n").split(",")
print after_split[0], float(after_split[1])
The main reason you're not getting output in your code is that datafile doesn't have a strip() method, and I'm surprised you're not getting exceptions.
I highly suggest you read the Python tutorial - it looks like you're trying to write Python in another language and that is not A Good Thing

You want to call strip and split on the line, not the file.
Replace
d1 = datafile.strip("\n").split(",")
With
d1 = record.strip("\n").split(",")

you operating with file handler, but should work on line
like this d1 = record.strip("\n").split(",")
datafile = open("temp.txt", "r")
record = datafile.readline()
while record != '':
d1 = record.strip("\n").split(",")
print d1[0],float (d1[1])
record = datafile.readline()
datafile.close()

Perhaps the following will work better for you (comments as explanation):
# open file this way so that it automatically closes upon any errors
with open("temp.txt", "r") as f:
data = f.readlines()
for line in data:
# only process non-empty lines
if line.strip():
d1 = line.strip("\n").split(",")
print d1[0], float(d1[1])

Related

Reading and writing a csv file Python

I just started learning Python, and I am trying to do the following:
- Read a .csv file
- Write the filtered data in a new file where the column 7 is not blank/empty
When I am printing my results, it shows the right output in the python shelf, but when I am checking my data in the .csv is no correct (differs from what is showing with the print function)
Any suggestion with my code?
Thank you in advance.
file = open("station.csv", "r")
writeFile = open("stations-filtered.csv", "w")
for line in file:
line2 = line.split(",")
if line2[7] != "":
print(line)
writeFile.write(line)
I agree with #user513093 that you can use csv, like:
file = open("station.csv", "r")
writeFile = open("stations-filtered.csv", "w")
writer = csv.writer(writeFile, delimiter=',')
for line in file:
line2 = line.split(",")
if line2[7] != "":
print(line)
writer.writerow(line)
But still, pandas is good:
import pandas as pd
file = pd.read_csv("station.csv", sep=",", header=None)
file = file[file[7] != ""]
file.to_csv("stations-filtered.csv")

Text in a file replaced by regex. How to write the change into file?

I can change the text in a file line by line, but I don't know how to write the results (changes) into the file.
This is a small part of my file:
<name>2016-09-15_obere-firstalm_gehen-6,5km</name>
<extensions>
<line xmlns="http://www.topografix.com/GPX/gpx_style/0/2">
<color>000000</color>
</line>
</extensions>
<trkseg>
<trkpt lat="47.671193" lon="11.886518">
<ele>1115.6</ele>
<time>2016-09-15T11:57:44Z</time>
</trkpt>
<trkpt lat="47.670686" lon="11.886412">
<ele>1117.6</ele>
<time>2016-09-15T11:58:14Z</time>
</trkpt>
<trkpt lat="47.670821" lon="11.886459">
<ele>1055.6</ele>
<time>2016-09-15T11:58:44Z</time>
</trkpt>
With a Python script I change values of elevations by adding 30.
Example:
Before change:
elevation (ele) 1115.6,
after change:
elevation (ele) 1145.6
#This little Python adds 30 to elevation:
import re
f1 = raw_input("name of your GPX file: ")
f1 = open(f1,'r+')
for line in f1:
res = re.search(r"<(ele)>(.+)</\1>",line)
if res:
number=float(res.group(2))
number_elev=number+30
number_elev=str(number_elev)
ress = re.sub(r"<(ele)>(.+)", r"\2",number_elev)
#print shows correct new values between <ele> and </ele>
print ress + "\n"
###but how to write into the gpx file these changes?
f1.close()
print "OK"
Expected: Write the file with the changed lines.
Actual: I don't know how to write a change by regexp into the file.
Thx in advance for your help.
You'll first need to read all the lines in your file and write each one to the file replacing the ones matching the regex search.
Also since one more <ele>..</ele> tag sequences can be on the same line, you'll need to find all occurrences of those in the line and replace them accordingly.
import re
f1 = raw_input("name of your GPX file: ")
with open(f1,'r') as f:
lines = f.readlines()
with open(f1, 'w') as f:
for line in lines:
ress = line
res = re.findall(r"<(ele)>(.+)</\1>",ress)
if res:
for r in res:
number=float(r[1])
number_elev=number+30
number_elev=str(number_elev)
ress=re.sub(r"<(ele)>{}</(ele)>".format(r[1]), r"<ele>{}</ele>".format(number_elev),string=ress, count=1)
f.write(ress)
Don't try to read and write from/to the same file at the same time. Just create and output file and write to it.
The following code is untested but it should work.
import re
f1 = input("name of your GPX file: ")
input_file = open(f1,'r+')
output_file = open(f1 + '_output', 'w+')
for line in input_file:
res = re.search(r"<(ele)>(.+)</\1>", line)
if res:
number=float(res.group(2))
number_elev=number+30
number_elev=str(number_elev)
line = line.replace(res.group(2), number_elev)
output_file.write(line)
input_file.close()
output_file.close()
print("OK")
You can read the file all at once and apply the regex to the data and write out the modified data to another file as follows:
import re
with open('input-file.xml') as fd:
data = fd.read()
regex = re.compile('(<ele>)([\d.]+)(</ele>)')
while True:
match = regex.search(data)
if not match:
break
new_value = float(match.group(2)) + 30
# <ele>6373.8</ele> becomes </ele>6373.8<ele> so that it doesnt match again
data = regex.sub(r'\g<3>{}\g<1>'.format(new_value), data, count=1)
# undo </ele>...<ele> the tag reversal done inside the while loop
regex = re.compile('(</ele>)([\d.]+)(<ele>)')
data = regex.sub(r'\3\2\1', data)
with open('output-file.xml', 'w') as fd:
fd.write(data)

Large text file to csv, can't open text file

I'm trying to convert this 3,1 GB text file from https://snap.stanford.edu/data/
into a csv file. All the data is structured like:
name: something
age: something
gender: something
which makes it a pretty large text file with some million lines.
I have tried to write a py script to convert it but for some reason it won't read the lines in my for each loop.
Here is the code:
import csv
def trycast(x):
try:
return float(x)
except:
try:
return int(x)
except:
return x
cols = ['product_productId', 'review_userId', 'review_profileName', 'review_helpfulness', 'review_score', 'review_time', 'review_summary', 'review_text']
f = open("movies.txt", "wb")
w = csv.writer(f)
w.writerow(cols)
doc = {}
with open('movies.txt') as infile:
for line in infile:
line = line.strip()
if line=="":
w.writerow([doc.get(col) for col in cols])
doc = {}
else:
idx = line.find(":")
key, value = tuple([line[:idx], line[idx+1:]])
key = key.strip().replace("/", "_").lower()
value = value.strip()
doc[key] = trycast(value)
f.close()
I'm not sure if it is because the document is to large, because a regulare notepad program won't be able to open it.
Thanks up front! :-)
In the line f = open("movies.txt", "wb") you're opening the file for writing, and thereby deleting all its content. Later on, you're trying to read from that same file. It probably works fine if you change the output filename. (I am not going to download 3.1 GB to test it. ;) )

Python search a file for text using input from another file

I'm new to python and programming. I need some help with a python script. There are two files each containing email addresses (more than 5000 lines). Input file contains email addresses that I want to search in the data file(also contains email addresses). Then I want to print the output to a file or display on the console. I search for scripts and was able to modify but I'm not getting the desired results. Can you please help me?
dfile1 (50K lines)
yyy#aaa.com
xxx#aaa.com
zzz#aaa.com
ifile1 (10K lines)
ccc#aaa.com
vvv#aaa.com
xxx#aaa.com
zzz#aaa.com
Output file
xxx#aaa.com
zzz#aaa.com
datafile = 'C:\\Python27\\scripts\\dfile1.txt'
inputfile = 'C:\\Python27\\scripts\\ifile1.txt'
with open(inputfile, 'r') as f:
names = f.readlines()
outputlist = []
with open(datafile, 'r') as fd:
for line in fd:
name = fd.readline()
if name[1:-1] in names:
outputlist.append(line)
else:
print "Nothing found"
print outputlist
New Code
with open(inputfile, 'r') as f:
names = f.readlines()
outputlist = []
with open(datafile, 'r') as f:
for line in f:
name = f.readlines()
if name in names:
outputlist.append(line)
else:
print "Nothing found"
print outputlist
Maybe I'm missing something, but why not use a pair of sets?
#!/usr/local/cpython-3.3/bin/python
data_filename = 'dfile1.txt'
input_filename = 'ifile1.txt'
with open(input_filename, 'r') as input_file:
input_addresses = set(email_address.rstrip() for email_address in input_file.readlines())
with open(data_filename, 'r') as data_file:
data_addresses = set(email_address.rstrip() for email_address in data_file.readlines())
print(input_addresses.intersection(data_addresses))
mitan8 gives the problem you have, but this is what I would do instead:
with open(inputfile, "r") as f:
names = set(i.strip() for i in f)
output = []
with open(datafile, "r") as f:
for name in f:
if name.strip() in names:
print name
This avoids reading the larger datafile into memory.
If you want to write to an output file, you could do this for the second with statement:
with open(datafile, "r") as i, open(outputfile, "w") as o:
for name in i:
if name.strip() in names:
o.write(name)
Here's what I would do:
names=[]
outputList=[]
with open(inputfile) as f:
for line in f:
names.append(line.rstrip("\n")
myEmails=set(names)
with open(outputfile) as fd, open("emails.txt", "w") as output:
for line in fd:
for name in names:
c=line.rstrip("\n")
if name in myEmails:
print name #for console
output.write(name) #for writing to file
I think your issue stems from the following:
name = fd.readline()
if name[1:-1] in names:
name[1:-1] slices each email address so that you skip the first and last characters. While it might be good in general to skip the last character (a newline '\n'), when you load the name database in the "dfile"
with open(inputfile, 'r') as f:
names = f.readlines()
you are including newlines. So, don't slice the names in the "ifile" at all, i.e.
if name in names:
I think you can remove name = fd.readline() since you've already got the line in the for loop. It'll read another line in addition to the for loop, which reads one line every time. Also, I think name[1:-1] should be name, since you don't want to strip the first and last character when searching. with automatically closes the files opened.
PS: How I'd do it:
with open("dfile1") as dfile, open("ifile") as ifile:
lines = "\n".join(set(dfile.read().splitlines()) & set(ifile.read().splitlines())
print(lines)
with open("ofile", "w") as ofile:
ofile.write(lines)
In the above solution, basically I'm taking the union (elements part of both sets) of the lines of both the files to find the common lines.

Searching and extracting WH-word from a file line by line with Python and regex

I have a file that has one sentence per line. I am trying to read the file and search if the sentence is a question using regex and extract the wh-word from the sentences and save them back into another file according the order it appeared in the first file.
This is what I have so far..
def whWordExtractor(inputFile):
try:
openFileObject = open(inputFile, "r")
try:
whPattern = re.compile(r'(.*)who|what|how|where|when|why|which|whom|whose(\.*)', re.IGNORECASE)
with openFileObject as infile:
for line in infile:
whWord = whPattern.search(line)
print whWord
# Save the whWord extracted from inputFile into another whWord.txt file
# writeFileObject = open('whWord.txt','a')
# if not whWord:
# writeFileObject.write('None' + '\n')
# else:
# whQuestion = whWord
# writeFileObject.write(whQuestion+ '\n')
finally:
print 'Done. All WH-word extracted.'
openFileObject.close()
except IOError:
pass
The result after running the code above: set([])
Is there something I am doing wrong here? I would be grateful if someone can point it out to me.
Something like this:
def whWordExtractor(inputFile):
try:
with open(inputFile) as f1:
whPattern = re.compile(r'(.*)who|what|how|where|when|why|which|whom|whose(\.*)', re.IGNORECASE)
with open('whWord.txt','a') as f2: #open file only once, to reduce I/O operations
for line in f1:
whWord = whPattern.search(line)
print whWord
if not whWord:
f2.write('None' + '\n')
else:
#As re.search returns a sre.SRE_Match object not string, so you will have to use either
# whWord.group() or better use whPattern.findall(line)
whQuestion = whWord.group()
f2.write(whQuestion+ '\n')
print 'Done. All WH-word extracted.'
except IOError:
pass
Not sure if it's what you're looking for, but you could try something like this:
def whWordExtractor(inputFile):
try:
whPattern = re.compile(r'who|what|how|where|when|why|which|whom|whose', re.IGNORECASE)
with open(inputFile, "r") as infile:
for line in infile:
whMatch = whPattern.search(line)
if whMatch:
whWord = whMatch.group()
print whWord
# save to file
else:
# no match
except IOError:
pass
Change '(.*)who|what|how|where|when|why|which|whom|whose(\.*)' to
".*(?:who|what|how|where|when|why|which|whom|whose).*\."

Categories

Resources