How to split a large file into chunks using python - python

import csv
divisor = 1000000
outfileno = 1
outfile = None
with open('o77069882.txt', 'r') as infile:
infile_iter = csv.reader(infile)
header = next(infile_iter)
for index, row in enumerate(infile_iter):
if index % divisor == 0:
if outfile is not None:
outfile.close()
outfilename = 'big-{}.csv'.format(outfileno)
outfile = open(outfilename, 'w')
outfileno += 1
writer = csv.writer(outfile)
writer.writerow(header)
writer.writerow(row)
# Don't forget to close the last file
if outfile is not None:
outfile.close()
I am using this above code to divide my file into chunks of size 100000, it does the job but each row in the first file is getting enclosed by quotes( " ") as for example "abc, dfg, ghj, kil" . The second and third file created are not having this problem, can anyone help me modify my code to get rid of the above error.

Related

How to read txt.file and find sum of different products with different prices

I have a following text-file products.txt:
Product;Amount;Price
Apple;3;10.00
Banana;1;5.00
Lemon;2;3.00
Orange;4;20.00
Apple;4;8.00
I want read this file and make a new text-file newfile.txt, which contains value of each row (Amount X Price):
30.00
5.00
6.00
80.00
32.00
Finally, I want to find the total sum of newfile.txt (which is 30+5+6+80+32 = 153)
Note, the price of same product can vary and we are not interested total sum of each product.
I started with creating class.
class DATA:
product= ""
amount= 0
price= 0
def read (name):
list = []
file= open(name, 'r', encoding="UTF-8")
file.readline()
while (True):
row= file.readline()
if(rivi == ''):
break
columns= row[:-1].split(';')
info= DATA()
info.amount= int(columns[1])
info.price= int(columns[2])
info.total = info.amount * info.price
file.append(info)
tiedosto.close()
return list
This should work:
def read(name):
total = 0
ori = open(name, 'r', encoding="UTF-8")
row = ori.readline()
dest = open("newfile.txt", 'w', encoding="UTF-8")
row = ori.readline()
while (row != ""):
row = row[:-1].split(';')
res = int(row[1]) * float(row[2])
total += res
dest.write(str(res) + "\n")
row = ori.readline()
ori.close()
dest.close()
print(total)
read("products.txt")
A possibility would be to use csv from the standard library.
import csv
# fix files' paths
path1 = # file to read
path2 = # file to write
# read data and perform computations
rows_tot = []
with open(path1, 'r', newline='', encoding="utf-8") as fd:
reader = csv.DictReader(fd, delimiter=";")
for row in reader:
rows_tot.append(float(row['Amount']) * float(row['Price']))
# total sum
print("Total sum:", int(sum(rows_tot)))
# save to file the new data
with open(path2, 'w', newline='') as fd:
fieldnames = ("AmountXPrice",)
writer = csv.DictWriter(fd, fieldnames=fieldnames)
writer.writeheader()
for value in rows_tot:
writer.writerow({fieldnames[0]: f"{value:.2f}"})
Remark: it is not clear from the question the type of the various data, in case just change int with float or the other way around.

Write number of rows to file name

I dont have any idea how to update my code below to the level that I can write the amount of changed row - in the CSV - to the file name.
I have did some stuff with count and row, but it is making any sense.
Can somebody give me some tips
import csv
import glob
import os
rows = []
for in_csv in glob.glob('C:/code/convert/Image/In/*.csv'):
print in_csv
with open(in_csv, 'rb') as f_input:
reader = csv.reader(f_input)
try:
all = []
row = next(reader)
row.insert(0, 'l_panoramic')
all.append(row)
for k, row in enumerate(reader):
all.append(['l_panoramic_{:06}'.format(k)] + row)
name, ext = os.path.splitext(in_csv)
with open("{}_{}{}".format(name, len(rows), ext), 'wb') as f_output:
writer = csv.writer(f_output, delimiter = ';')
writer.writerows(all)
print " {} lines found".format(len(rows))
except StopIteration as e:
print " No lines found"
This could be done using the glob library to create your list of CSV files. Use splitext() to take then existing filename and split it into a filename and extension, the number of rows can then be easily added using a format() statement as follows:
import csv
import glob
import os
rows = []
for in_csv in glob.glob('a*.csv'):
print in_csv
with open(in_csv, 'rb') as f_input:
reader = csv.reader(f_input)
try:
row = next(reader)
row.insert(0, 'L_panoramic')
rows.append(row)
for k, row in enumerate(reader):
rows.append(['l_panoramic_{:06}'.format(k)] + row)
name, ext = os.path.splitext(in_csv)
with open("{}_{}{}".format(name, len(rows), ext), 'wb') as f_output:
writer = csv.writer(f_output, delimiter = ';')
writer.writerows(rows)
print " {} lines found".format(len(rows))
except StopIteration as e:
print " No lines found"
You were already creating a list of rows to be written, so once this list is complete, you will know how many rows there are. With this you can then open the output file with the number of rows added to the name, and write all the rows to it.

csv value modification for certain cells on odd rows on a particular column

Hi I'm trying to finish this small piece of code for modifying csv files, I've got this far with some help:
edit... some more info.
Basically what I’m looking to do is make some small changes to the csv file depending on the project and parent issue in JIRA. Python will then make the changes to the csv file before it is then read into JIRA - that’s the second part of the program I’ve not even really looked at yet.
I’m only looking to change the BOX-123 type cells and leave the blank ones blank.
But the idea of the program is that I can use it to make some small changes to a template which will then automatically create some issues in JIRA.
import os
import csv
project = 'Dudgeon'
parent = 'BOX-111'
rows = (1,1007)
current = os.getcwd()
filename = 'test.csv'
filepath = os.path.join(os.getcwd(), filename)
#print(current)
#print(filename)
print(filepath)
with open(filepath, 'r') as csvfile:
readCSV = csv.reader(csvfile)
next(readCSV, None)
for row in readCSV:
print(row[16])
row_count =sum(1 for row in readCSV)
print(row_count)
with open(filepath, 'r') as infile, open('out.csv', 'w') as outfile:
outfile.write(infile.readline()) # write out the 1st line
for line in infile:
cols = line.strip().split(',')
cols[16] = project
outfile.write(','.join(cols) + '\n')
with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
for row in infile:
if row % 2 != 0:
cols [15] = parent
outfile.write()
Any help really appreciated.
You want to use the row's index when comparing to 0. Use enumerate():
with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
for rowidx,row in enumerate(infile):
cols = row.strip().split(',')
if rowidx % 2 != 0:
cols[15] = parent
outfile.write(cols)
You really should be using the csv module here, though. Untested but should get you started.
with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
for rowidx,row in enumerate(reader):
if rowidx % 2 != 0:
row[15] = parent
writer.write_row(row)
A friend helped me last night and this is what they came up with:
with open(filepath, 'r') as infile, open('out.csv', 'w') as outfile:
outfile.write(infile.readline()) # write out the 1st line
for line in infile:
cols = line.strip().split(',')
cols[16] = project
outfile.write(','.join(cols) + '\n')
with open('out.csv', 'r') as infile, open('out1.csv', 'w') as outfile:
outfile.write(infile.readline()) # write out the 1st line
lineCounter = 0
for line in infile:
lineCounter += 1
cols = line.strip().split(',')
if lineCounter % 2 != 0:
cols[15] = parent
outfile.write(','.join(cols) + '\n')

Merge semicolon delimited txt file looping in directory

Suppose I have many different text files from the same directory with the content structure as shown below:
File a.txt:
HEADER_X;HEADER_Y;HEADER_Z
a_value;a_value;a_value
a_value;a_value;a_value
File b.txt:
HEADER_X;HEADER_Y;HEADER_Z
b_value;b_value;b_value
b_value;b_value;b_value
File c.txt:
HEADER_X;HEADER_Y;HEADER_Z
c_value;c_value;c_value
c_value;c_value;c_value
File d.txt: ...
I'd like to merge all of the txt files into one, by appending the content of each file to the final row of the each previous file. See below:
File combined.txt:
HEADER_X;HEADER_Y;HEADER_Z
a_value;a_value;a_value
a_value;a_value;a_value
b_value;b_value;b_value
b_value;b_value;b_value
c_value;c_value;c_value
c_value;c_value;c_value
...
How can I do this in Python?
Assumptions:
- all txt files are located in the same folder
- all txt files have same headers
- all txt files have same number of columns
- all txt files have different number of rows
Use the CSV Module. Something like this:
import csv
with ('output.csv', 'ab') as output:
writer = csv.writer(output, delimiter=";")
with open('a.txt', 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=";")
reader.readline() // this is to skip the header
for row in spamreader:
writer.writerow(row)
If you didn't want to harcode in every file (Say you have many more than three) you could do:
from os import listdir
from os.path import isfile, join
onlyfiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
for aFile in onlyfiles:
with open(aFile, 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=";")
reader.readline() // this is to skip the header
for row in spamreader:
writer.writerow(row)
I managed to do something that seems to work (at least in the cases I tested).
This will parse all files, get all the headers and format the values on each line of each file to add ";" according to the headers present/absent on that file.
headers = []
values = []
files = ("csv0.txt", "csv1.txt")#put the files you want to parse here
#read the files a first time, just to get the headers
for file_name in files:
file = open(file_name, 'r')
first_line = True
for line in file:
if first_line:
first_line = False
for header in line.strip().split(";"):
if header not in headers:
headers.append(header)
else:
break
file.close()
headers = sorted(headers)
#read a second time to get the values
file_number = 0
for file_name in files:
file = open(file_name, 'r')
file_headers = []
first_line = True
corresponding_indexes = []
values.append([])
for line in file:
if first_line:
first_line = False
index = 0
for header in line.strip().split(";"):
while headers[index] != header:
index += 1
corresponding_indexes.append(index)
else:
line_values = line.strip().split(";")
current_index = 0
values_str = ""
for value in line_values:
#this part write the values with ";" added for the headers not in this file
while current_index not in corresponding_indexes:
current_index += 1
values_str += ";"
values_str += value + ";"
current_index += 1
values_str = values_str[:-1] #we remove the last ";" (useless)
values[file_number].append(values_str)
file_number += 1
file.close()
#and now we write the output file with all headers and values
headers_str = ""
for header in headers:
headers_str += header + ";"
headers_str = headers_str[:-1]
output_file = open("output.txt", 'w')
output_file.write(headers_str + "\n")
for file_values in values:
for values_line in file_values:
output_file.write(values_line + "\n")
output_file.close()
If you have any question, feel free to ask.

How to split code into smaller functions

I have an application that works. But in the interest of attempting to understand functions and python better. I am trying to split it out into various functions.
I"m stuck on the file_IO function. I'm sure the reason it does not work is because the main part of the application does not understand reader or writer. To better explain. Here is a full copy of the application.
Also I'm curious about using csv.DictReader and csv.DictWriter. Do either provide any advantages/disadvantages to the current code?
I suppose another way of doing this is via classes which honestly I would like to know how to do it that way as well.
#!/usr/bin/python
""" Description This script will take a csv file and parse it looking for specific criteria.
A new file is then created based of the original file name containing only the desired parsed criteria.
"""
import csv
import re
import sys
searched = ['aircheck', 'linkrunner at', 'onetouch at']
def find_group(row):
"""Return the group index of a row
0 if the row contains searched[0]
1 if the row contains searched[1]
etc
-1 if not found
"""
for col in row:
col = col.lower()
for j, s in enumerate(searched):
if s in col:
return j
return -1
#Prompt for File Name
def file_IO():
print "Please Enter a File Name, (Without .csv extension): ",
base_Name = raw_input()
print "You entered: ",base_Name
in_Name = base_Name + ".csv"
out_Name = base_Name + ".parsed.csv"
print "Input File: ", in_Name
print "OutPut Files: ", out_Name
#Opens Input file for read and output file to write.
in_File = open(in_Name, "rU")
reader = csv.reader(in_File)
out_File = open(out_Name, "wb")
writer = csv.writer(out_File, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
return (reader, writer)
file_IO()
# Read header
header = reader.next()
stored = []
writer.writerow([header[0], header[3]])
for i, row in enumerate(reader):
g = find_group(row)
if g >= 0:
stored.append((g, i, row))
stored.sort()
for g, i, row in stored:
writer.writerow([row[0], row[3]])
# Closing Input and Output files.
in_File.close()
out_File.close()
If I were you, I'd only separate find_group.
import csv
def find_group(row):
GROUPS = ['aircheck', 'linkrunner at', 'onetouch at']
for idx, group in enumerate(GROUPS):
if group in map(str.lower, row):
return idx
return -1
def get_filenames():
# this might be the only other thing you'd want to factor
# into a function, and frankly I don't really like getting
# user input this way anyway....
basename = raw_input("Enter a base filename (no extension): ")
infilename = basename + ".csv"
outfilename = basename + ".parsed.csv"
return infilename, outfilename
# notice that I don't open the files yet -- let main handle that
infilename, outfilename = get_filenames()
with open(infilename, 'rU') as inf, open(outfilename, 'wb') as outf:
reader = csv.reader(inf)
writer = csv.writer(outf, delimiter=',',
quotechar='"', quoting=csv.QUOTE_ALL)
header = next(reader)
writer.writerow([[header[0], header[3]])
stored = sorted([(find_group(row),idx,row) for idx,row in
enumerate(reader)) if find_group(row) >= 0])
for _, _, row in stored:
writer.writerow([row[0], row[3]])

Categories

Resources