Split a large text file to small ones based on location - python

Suppose I have a big file, file.txt, with around 300,000 lines of data. I want to split it based on a certain key field: the location. See file.txt below:
Line 1: U0001;POUNDS;**CAN**;1234
Line 2: U0001;POUNDS;**USA**;1234
Line 3: U0001;POUNDS;**CAN**;1234
...
Line 100000: U0001;POUNDS;**CAN**;1234
The locations are limited to 10-15 different nations, and I need to separate the records for each country into its own file. How can I do this task in Python?
Thanks for help

This will run with very low memory overhead as it writes each line as it reads it.
Algorithm:
open input file
read a line from input file
get country from line
if new country then open file for country
write the line to country's file
loop if more lines
close files
Code:
with open('file.txt', 'r') as infile:
    try:
        outfiles = {}
        for line in infile:
            country = line.split(';')[2].strip('*')
            if country not in outfiles:
                outfiles[country] = open(country + '.txt', 'w')
            outfiles[country].write(line)
    finally:
        for outfile in outfiles.values():
            outfile.close()
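On Python 3 the same idea can be sketched with contextlib.ExitStack, which closes every per-country file automatically instead of the explicit try/finally (a variant sketch, not the original answer's code):

# Sketch, Python 3: ExitStack closes every opened country file
# when the with-block exits, even on error.
from contextlib import ExitStack

with open('file.txt') as infile, ExitStack() as stack:
    outfiles = {}
    for line in infile:
        country = line.split(';')[2].strip('*')
        if country not in outfiles:
            outfiles[country] = stack.enter_context(open(country + '.txt', 'w'))
        outfiles[country].write(line)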

with open("file.txt") as f:
content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
text = [x.strip() for x in content]
x = [i.split(";") for i in text]
x.sort(key=lambda x: x[2])
from itertools import groupby
from operator get itemgetter
y = groupby(x, itemgetter(2))
res = [(i[0],[j for j in i[1]]) for i in y]
for country in res:
with open(country[0]+".txt","w") as writeFile:
writeFile.writelines("%s\n" % ';'.join(l) for l in country[1])
This will group the rows by your country field.
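To see why the sort step matters: itertools.groupby only groups consecutive equal keys, so unsorted rows for the same country would land in separate groups. A tiny illustration with made-up rows (Python 3):

# Illustration: sort first, then groupby collects all rows sharing
# the same third field into one group.
from itertools import groupby
from operator import itemgetter

rows = [['U0001', 'POUNDS', 'CAN'],
        ['U0001', 'POUNDS', 'USA'],
        ['U0002', 'POUNDS', 'CAN']]
rows.sort(key=itemgetter(2))
for country, group in groupby(rows, itemgetter(2)):
    print(country, list(group))
# CAN [['U0001', 'POUNDS', 'CAN'], ['U0002', 'POUNDS', 'CAN']]
# USA [['U0001', 'POUNDS', 'USA']]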
Hope it helps!

Looks like what you have is a CSV file. CSV stands for comma-separated values, but any file that uses a different delimiter (in this case a semicolon, ;) can be treated like a CSV file.
We'll use the Python csv module to read the file in, and then write a file for each country:
import csv
from collections import defaultdict

d = defaultdict(list)
with open('file.txt', 'rb') as f:
    r = csv.reader(f, delimiter=';')
    for line in r:
        d[line[2]].append(line)

for country in d:
    with open('{}.txt'.format(country), 'wb') as outfile:
        w = csv.writer(outfile, delimiter=';')
        for line in d[country]:
            w.writerow(line)
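For what it's worth, the same approach in Python 3 opens the files in text mode with newline='' instead of 'rb'/'wb' (a sketch of the port, not part of the original answer):

# Python 3 sketch of the same grouping; csv files are opened in text
# mode with newline='' so the csv module controls line endings.
import csv
from collections import defaultdict

d = defaultdict(list)
with open('file.txt', newline='') as f:
    for row in csv.reader(f, delimiter=';'):
        d[row[2]].append(row)

for country, rows in d.items():
    with open('{}.txt'.format(country), 'w', newline='') as outfile:
        csv.writer(outfile, delimiter=';').writerows(rows)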

# the formatting-function for the filename used for saving
outputFileName = "{}.txt".format
# alternative:
##import time
##outputFileName = lambda loc: "{}_{}.txt".format(loc, time.asctime())

# make a dictionary indexed by location; the contained item is the new content of the file for that location
sortedByLocation = {}
f = open("file.txt", "r")
# iterate over each line and look at the column for the location
for l in f.readlines():
    line = l.split(';')
    # the third field (indices begin with 0) is the location abbreviation;
    # lowercase it, because on some filesystems a file whose name differs only
    # in case gets overwritten, while Python treats upper and lower case as distinct
    location = line[2].lower().strip()
    # get the previous lines for the location and store the result back
    tmp = sortedByLocation.get(location, "")
    sortedByLocation[location] = tmp + l.strip() + '\n'
f.close()

# save a file for each location
for location, text in sortedByLocation.items():
    with open(outputFileName(location), "w") as f:
        f.write(text)

Related

How to transform a list into a CSV file with N items per row?

I want to create a new CSV file with 3 items per row.
My source file looks like (there are no new lines / line breaks):
12123, 1324, 232324, 243443, 234, 2345, 2334, 2445, 22355, 222234, 2345
Now I want to transform this file into a CSV file. Take the first three elements and put them in the first row, start a new line, take the next three items, and so on:
12123, 1324, 232324
243443, 234, 2345
...
How can I do that with Python 3.x? I'm new to Python and don't get it...
My previous attempt:
import csv
with open('test.csv') as f:
    reader = csv.reader(f)
    with open('test2.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        liste = list(reader)
        print(liste[1:2])
But my list object has only one long item.
You mentioned:
My source file looks like (there are no new lines / line breaks):
12123, 1324, 232324, 243443, 234, 2345, 2334, 2445, 22355, 222234, 2345
So this reads one long row of a CSV, then writes it as groups of three per line:
import csv

with open('test.csv', newline='') as f:
    reader = csv.reader(f)
    line = next(reader)  # Read the one long line

with open('test2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for i in range(0, len(line), 3):  # step by threes
        writer.writerow(line[i:i+3])
Note that correct use of the csv module requires the files to be opened with newline='' in Python 3 ('rb' or 'wb' in Python 2).
File I/O neutral solution:
csv = """12123, 1324, 232324, 243443, 234, 2345
2334, 2445, 22355, 222234, 2345"""  # replace this with the file you read from CSV

def sixPerLineToThreePerLine(s):
    result = ""
    for line in s.split("\n"):
        sp = line.split(", ")
        result = result + ", ".join(sp[:3]) + "\n" + ", ".join(sp[3:]) + "\n"
    return result

print(sixPerLineToThreePerLine(csv))  # replace this with code to write to CSV
Here's a solution, but it's a bit long. Basically I would write all the values in the csv to a list, then take three values at a time from the list and write them to the csv until there are no values left.
import csv

# just an example csv
with open('example.csv', 'w') as csvfile:
    # create an example csv with a single row of the numbers 0-19
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow([i for i in range(20)])

# open the file for reading, append values to a list
l = []
with open('example.csv') as csvfile:
    # read the example file into a list
    reader = csv.reader(csvfile)
    for row in reader:
        for val in row:
            l.append(val)

# write to the original file with 3 values per line
with open('example.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile)
    while l:
        # write to the file 3 values at a time; slicing handles the
        # last bit if the values don't divide evenly by 3
        spamwriter.writerow(l[:3])
        l = l[3:]
I'd recommend checking out pandas.
I find it a lot easier to manipulate CSVs with it, but it's not in the standard library.
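For example, a minimal pandas sketch of the same reshaping (assuming the one-row test.csv from the question and that pandas is installed):

# Hedged sketch: read the single long row, regroup it three values per
# row, and write it back out without header or index columns.
import pandas as pd

values = pd.read_csv('test.csv', header=None).iloc[0].tolist()
rows = [values[i:i + 3] for i in range(0, len(values), 3)]
pd.DataFrame(rows).to_csv('test2.csv', header=False, index=False)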
This should help. This was written using Python 2.7, so if you have any problems running it in 3.x let me know and I can try to help.
import csv  # import the csv module; if you want to avoid this you can just read it in as a text file

output = ""  # make an output string
num = 0  # initialize num, which tracks how many numbers have been read
with open('datacsv.csv', 'rb') as f:  # open the input file
    file = csv.reader(f)  # initialize the file as a csv file
    for row in file:  # for every row (you said no new lines, but just in case)
        for data in row:  # for every entry in the row
            if num == 2:  # if you have read in three numbers
                num = 0  # reset num
                output += data + "\n"  # add the current number and a new line
            else:
                num += 1  # increment num
                output += data + ","  # add the data and a comma to the output

new = open("outputcsv.csv", "w")  # create the output file
new.write(output)  # write the output data to the new file
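In case it saves you the trouble, a rough Python 3 port of the above (a sketch; the only real changes are the file mode and newline handling):

# Rough Python 3 sketch of the same string-building approach: text
# mode with newline='' replaces the Python 2 'rb' mode.
import csv

output = ""
num = 0
with open('datacsv.csv', newline='') as f:
    for row in csv.reader(f):
        for data in row:
            if num == 2:      # third number of the group: end the line
                num = 0
                output += data + "\n"
            else:             # first or second number: keep the line going
                num += 1
                output += data + ","

with open("outputcsv.csv", "w") as new:
    new.write(output)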
I wrote a short program that I think does what you wanted:
It reads all lines from the reader file and then just inserts them into the writer file three by three :)
import csv

def main():
    with open('ex.csv', 'rb') as f:
        reader = csv.reader(f)
        with open('ex2.csv', 'wb') as csvfile:
            writer = csv.writer(csvfile)
            pass_on = []
            for row in reader:
                #print row
                for c in xrange(0, len(row)):  # collect every non-empty entry
                    if row[c]:
                        pass_on.append(row[c])
            print pass_on
            while pass_on:  # write the collected values three at a time
                writer.writerow(pass_on[:3])
                pass_on = pass_on[3:]
    print "done"

if __name__ == '__main__':
    main()
What you want to do is read in the data from the file, then split it into individual elements. Once you have it in individual elements you can put them in groups of three and write to your output file.
Something like this should work:
def read_data(file_path):
    with open(file_path, 'r') as fh:
        elements = fh.read()
    data = [element.strip() for element in elements.split(',')]
    return data

def group(data):
    grouped = [', '.join(data[n:n+3]) for n in range(0, len(data), 3)]
    return grouped

def write(data, output):
    with open(output, 'w') as fh:
        fh.writelines(line + '\n' for line in data)

def main():
    data = read_data('test.csv')
    data = group(data)
    write(data, 'test2.csv')

if __name__ == '__main__':
    main()
A four-line solution without the csv module:
with open('oneline_numbers.csv') as fobj_in, open('three_numbers.csv', 'w') as fobj_out:
    numbers = (entry.strip() for entry in next(fobj_in).split(','))
    for line in zip(*[numbers] * 3):
        fobj_out.write(', '.join(line) + '\n')
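One caveat worth knowing: zip() silently drops a final group of fewer than three numbers. If the count may not divide evenly by 3, a sketch with itertools.zip_longest keeps the remainder:

# Sketch: zip_longest pads the last group instead of dropping it;
# filter(None, ...) strips the padding before writing.
from itertools import zip_longest

with open('oneline_numbers.csv') as fobj_in, open('three_numbers.csv', 'w') as fobj_out:
    numbers = (entry.strip() for entry in next(fobj_in).split(','))
    for line in zip_longest(*[numbers] * 3, fillvalue=''):
        fobj_out.write(', '.join(filter(None, line)) + '\n')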

Python script to count num lines in all files in directory

So I'm new to Python and I'm trying to write a script that iterates through all the .txt files in a directory, counts the number of lines in each one (with the exception of lines that are blank or commented out), and writes the final output to a csv. The final output should look something like this:
agprices, avi, adp
132, 5, 8
I'm having trouble with the syntax to save each count as the value of the dictionary. Here is my code below:
#!/usr/bin/env python
import csv
import copy
import os
import sys

#get current working dir, set count, and select file delimiter
d = os.getcwd()
count = 0
ext = '.txt'

#parses through files and saves to a dict
series_dict = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    with open(os.path.join(d,f)) as file_obj:
        series_dict[f] = file_obj.read()
        if line.strip(): #Exclude blank lines
            continue
        else if line.startswith("#"): #Exclude commented lines
            continue
        else
            count +=1
            #Need to save count as val in dict here

#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, series_dict.keys())
    w.writeheader()
    w.writerow(series_dict)
So here's the edit:
#!/usr/bin/env python
import csv
import copy
import os
import sys
import glob

#get current working dir, set count, and select file delimiter
os.chdir('/Users/Briana/Documents/Misc./PythonTest')

#parses through files and saves to a dict
series = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        series[fn] = (1 for line in f if line.strip() and not line.startswith('#'))
print series

#save the dictionary with key/val pairs to a csv
with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, series.keys())
sum(names.values())
I'm getting an indentation error on the 2nd to last line and am not quite sure why. Also, I'm not positive that I'm writing the syntax correctly on the last part. Again, I'm simply trying to return a dictionary with the names of files and the number of lines in each, like {a: 132, b: 245, c: 13}.
You can try something along these lines:
os.chdir(ur_directory)
names = {}
for fn in glob.glob('*.txt'):
    with open(fn) as f:
        names[fn] = sum(1 for line in f if line.strip() and not line.startswith('#'))
print names
That will print a dictionary similar to:
{'test_text.txt': 20, 'f1.txt': 3, 'lines.txt': 101, 'foo.txt': 6, 'dat.txt': 6, 'hello.txt': 1, 'f2.txt': 4, 'neglob.txt': 8, 'bar.txt': 6, 'test_reg.txt': 6, 'mission_sp.txt': 71, 'test_nums.txt': 8, 'test.txt': 7, '2591.txt': 8303}
And you can use that Python dict in csv.DictWriter.
If you want the sum of those, just do:
sum(names.values())
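To finish the job the question asks about (one header row of file names, one row of counts), a sketch with csv.DictWriter in the same Python 2 style as above:

# Sketch: the names dict from above becomes one header row plus one
# row of per-file line counts, e.g. "agprices,avi,adp" / "132,5,8".
import csv

with open('seriescount.csv', 'wb') as f:
    w = csv.DictWriter(f, names.keys())
    w.writeheader()
    w.writerow(names)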
I think you should make two changes to your script:
Use glob.glob() to get the list of files matching your desired suffix
Use for line in file_obj to iterate through the lines
Other problem:
The indentation is wrong on your last few lines
You could count your lines in your files with this 1-liner:
line_nums = sum(1 for line in open(f) if line.strip() and line[0] != '#')
That would shorten your code segment to:
for f in txt_files:
    count += sum(1 for line in open(os.path.join(d,f))
                 if line[0] != '#' and line.strip())
It looks like you want to use a dictionary to keep track of the counts. You could create one at the top like this: counts = {}
Then (once you fix your tests) you can update it for each non-comment line:
counts = {}
txt_files = [i for i in os.listdir(d) if os.path.splitext(i)[1] == ext]
#selects all files with .txt extension
for f in txt_files:
    counts[f] = 0  # create an entry in the dictionary to keep track of one file's lines
    with open(os.path.join(d,f)) as file_obj:
        for line in file_obj:
            if line.startswith("#"):  #Exclude commented lines
                continue
            elif line.strip():  #Exclude blank lines
                counts[f] += 1

index out of range python

I'm trying to write a very simple program using tuples. It works for the most part, but I can't really get it to work when accessing individual elements in the tuples.
I'm taking input from a file containing some info, converting it to tuples, and then storing the data in some other file.
It works if I write all the data or just the first tuple, but not in any other case. Following is the code:
filename = "in.txt"
stock_market = []
for line in open(filename):
    fields = line.split(",")
    name = fields[0]
    shares = int(fields[1])
    stock = (name, shares)
    portfolio.append(stock)

f = open("output.txt", "w")
print >>f, portfolio[1]
f.close()
You can't append to portfolio without defining it first. Try something like this:
inFilename = "in.txt"
outFilename = "output.txt"

with open(inFilename, 'r') as inf:
    with open(outFilename, 'w') as outf:
        for line in inf:
            fields = line.split(',')
            print >>outf, (fields[0], fields[1])
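Alternatively, the minimal fix to the original code is just to define the list before appending to it, since the question creates stock_market but appends to portfolio. A sketch:

# Sketch of the minimal fix: name the list consistently and define it
# before the loop that appends to it.
portfolio = []
for line in open("in.txt"):
    fields = line.split(",")
    portfolio.append((fields[0], int(fields[1])))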

Replace character in line inside a file

I have these different lines with values in a text file
sample1:1
sample2:1
sample3:0
sample4:15
sample5:500
and I want the number after the ":" to be updated sometimes
I know I can split the name by ":" and get a list with 2 values.
f = open("test.txt","r")
lines = f.readlines()
lineSplit = lines[0].split(":",1)
lineSplit[1] #this is the value I want to change
I'm not quite sure how to update the lineSplit[1] value and write it back with the write functions.
You can use the fileinput module, if you're trying to modify the same file:
>>> strs = "sample4:15"
Take advantage of sequence unpacking to store the results in variables after splitting.
>>> sample, value = strs.split(':')
>>> sample
'sample4'
>>> value
'15'
Code:
import fileinput

for line in fileinput.input(filename, inplace=True):
    sample, value = line.split(':')
    value = int(value)  # convert value to int for calculation purposes
    if some_condition:
        # do some calculations on sample and value
        # modify sample, value if required
        pass
    # now write the data (either modified or still the old one) back to the file
    print "{}:{}".format(sample, value)
Strings are immutable, meaning you can't assign new values inside them by index.
But you can split up the whole file into a list of lines, and change individual lines (strings) entirely. This is what happens below in lineSplit[1] = str(new_integer).
with open(filename, 'r') as f:
    lines = f.read().splitlines()

for i, line in enumerate(lines):
    if condition:
        lineSplit = line.split(':')
        lineSplit[1] = str(new_integer)
        lines[i] = ':'.join(lineSplit)

with open(filename, 'w') as f:
    f.write('\n'.join(lines))
Maybe something like this (assuming that each first element before the : is indeed a key):
from collections import OrderedDict

with open('fin') as fin:
    samples = OrderedDict(line.strip().split(':', 1) for line in fin)

samples['sample3'] = 'something else'

with open('output', 'w') as fout:
    lines = (':'.join(el) + '\n' for el in samples.iteritems())
    fout.writelines(lines)
Another option is to use the csv module (: is the column delimiter in your case).
Assuming there is a test.txt file with the following content:
sample1:1
sample2:1
sample3:0
sample4:15
sample5:500
And you need to increment each value. Here's how you can do it:
import csv

# read the file
with open('test.txt', 'r') as f:
    reader = csv.reader(f, delimiter=":")
    lines = [line for line in reader]

# write the file
with open('test.txt', 'w') as f:
    writer = csv.writer(f, delimiter=":")
    for line in lines:
        # edit the data here
        # e.g. increment each value
        line[1] = int(line[1]) + 1
    writer.writerows(lines)
The contents of test.txt are now:
sample1:2
sample2:2
sample3:1
sample4:16
sample5:501
But, anyway, fileinput sounds more logical to use in your case (editing the same file).
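To make that concrete, a sketch of the same increment done in place with fileinput (Python 2 print syntax, matching the earlier answer):

# Sketch: fileinput with inplace=True redirects print back into
# test.txt, so each "sample:value" line is rewritten with value + 1.
import fileinput

for line in fileinput.input('test.txt', inplace=True):
    sample, value = line.strip().split(':')
    print "{}:{}".format(sample, int(value) + 1)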
Hope that helps.

Read file from line 2 or skip header row

How can I skip the header row and start reading a file from line 2?
with open(fname) as f:
    next(f)
    for line in f:
        # do something
        pass
f = open(fname,'r')
lines = f.readlines()[1:]
f.close()
If you want the first line and then want to perform some operation on the rest of the file, this code will be helpful.
with open(filename, 'r') as f:
    first_line = f.readline()
    for line in f:
        # Perform some operations
        pass
If slicing could work on iterators...
from itertools import islice

with open(fname) as f:
    for line in islice(f, 1, None):
        pass
f = open(fname).readlines()
firstLine = f.pop(0)  # removes the first line
for line in f:
    ...
To generalize the task of reading multiple header lines and to improve readability, I'd use method extraction. Suppose you wanted to tokenize the first two lines of coordinates.txt to use as header information.
Example
coordinates.txt
---------------
Name,Longitude,Latitude,Elevation, Comments
String, Decimal Deg., Decimal Deg., Meters, String
Euler's Town,7.58857,47.559537,0, "Blah"
Faneuil Hall,-71.054773,42.360217,0
Yellowstone National Park,-110.588455,44.427963,0
Then method extraction allows you to specify what you want to do with the header information (in this example we simply tokenize the header lines on the comma and return them as lists, but there's room to do much more).
def __readheader(filehandle, numberheaderlines=1):
    """Reads the specified number of lines and returns the comma-delimited
    strings on each line as a list"""
    for _ in range(numberheaderlines):
        yield map(str.strip, filehandle.readline().strip().split(','))

with open('coordinates.txt', 'r') as rh:
    # Single header line
    #print next(__readheader(rh))
    # Multiple header lines
    for headerline in __readheader(rh, numberheaderlines=2):
        print headerline  # Or do other stuff with headerline tokens
Output
['Name', 'Longitude', 'Latitude', 'Elevation', 'Comments']
['String', 'Decimal Deg.', 'Decimal Deg.', 'Meters', 'String']
If coordinates.txt contains another header line, simply change numberheaderlines. Best of all, it's clear what __readheader(rh, numberheaderlines=2) is doing, and we avoid the ambiguity of having to figure out or comment on why the author of the accepted answer uses next() in his code.
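For anyone on Python 3, a hedged port of the same generator (print becomes a function, and since map is lazy in Python 3 a list comprehension is clearer):

# Sketch, Python 3: same header-reading generator, yielding a list of
# stripped tokens per header line.
def __readheader(filehandle, numberheaderlines=1):
    for _ in range(numberheaderlines):
        yield [token.strip() for token in filehandle.readline().split(',')]

with open('coordinates.txt') as rh:
    for headerline in __readheader(rh, numberheaderlines=2):
        print(headerline)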
If you want to read multiple CSV files starting from line 2, this works like a charm
for files in csv_file_list:
    with open(files, 'r') as r:
        next(r)  # skip headers
        rr = csv.reader(r)
        for row in rr:
            # do something
            pass
(this is part of Parfait's answer to a different question)
# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process only the first 1000 rows
    for j in range(0, 1000):

        # Split the current line into a list: line
        line = file.readline().split(',')

        # Get the value for the first column: first_col
        first_col = line[0]

        # If the column value is in the dict, increment its value
        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1

        # Else, add to the dict and set value to 1
        else:
            counts_dict[first_col] = 1

# Print the resulting dictionary
print(counts_dict)
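The same tally can be sketched more compactly with collections.Counter and itertools.islice (not part of the quoted answer):

# Sketch: Counter does the increment-or-initialize bookkeeping, and
# islice caps the read at the first 1000 data rows.
from collections import Counter
from itertools import islice

with open('world_dev_ind.csv') as file:
    file.readline()  # skip the column names
    counts_dict = Counter(line.split(',')[0] for line in islice(file, 1000))

print(counts_dict)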
