Creating new CSV with selected columns from another - python

I'm trying to create a script that selects some columns from a CSV file and saves them into another one (ideally specifying the column header).
This is the script I'm starting from, which copies all columns. How can I change it to copy just a selection of them?
# importing openpyxl module
import openpyxl as xl

# opening the source excel file
filename = "C:\\Users\\...\\input.clv"
wb1 = xl.load_workbook(filename)
ws1 = wb1.worksheets[0]

# opening the destination excel file
filename1 = "C:\\Users\\...\\output.clv"
wb2 = xl.load_workbook(filename1)
ws2 = wb2.active

# calculate total number of rows and
# columns in source excel file
mr = ws1.max_row
mc = ws1.max_column

# copying the cell values from source
# excel file to destination excel file
for i in range(1, mr + 1):
    for j in range(1, mc + 1):
        # reading cell value from source excel file
        c = ws1.cell(row=i, column=j)
        # writing the read value to destination excel file
        ws2.cell(row=i, column=j).value = c.value

# saving the destination excel file
wb2.save(str(filename1))
Thank you in advance!

This is how I might do it using Python file reading/writing.
def readCsv(fileName):
    data = []
    myFile = open(fileName, "r")
    for line in myFile:
        lineList = line.split(",")
        # strip the trailing newline from the last field
        lineList[-1] = lineList[-1].replace("\n", "")
        data.append(lineList)
    myFile.close()
    return data

def writeCsv(data):
    dataString = ""
    for line in data:
        dataString = dataString + ','.join(line) + "\n"
    myNewFile = open("output.csv", "w")
    myNewFile.write(dataString)
    myNewFile.close()

data = readCsv("yourCsv.csv")
# Remove the data you don't need
writeCsv(dataAfterRemovingColumns)
My readCsv function produces a 2D List where each item is a list of the data in one row of the CSV file. So, where I have commented # Remove the data you don't need you would iterate through the 2D list, removing the item from each row that makes up part of the column you want to delete. Hope that makes sense!
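For example, a minimal sketch of that removal step, assuming you want to keep only the first and third columns (the indices here are illustrative):
columnsToKeep = [0, 2]  # illustrative: keep the first and third columns
dataAfterRemovingColumns = []
for row in data:
    dataAfterRemovingColumns.append([row[i] for i in columnsToKeep])
writeCsv(dataAfterRemovingColumns)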

You can use the csv module from the stdlib:
#!/usr/bin/env python
import csv

inputCsvFilePath = 'input.csv'
outputCsvFilePath = 'output.csv'
inputCsvColumnNumbers = [1, 3, 5]
outputCsvColumnHeaders = ['one', 'three', 'five']

# reading/writing row by row (high IO, low memory):
with open(inputCsvFilePath, newline='') as inputCsv:
    inputCsvReader = csv.reader(inputCsv)
    with open(outputCsvFilePath, 'w', newline='') as outputCsv:
        outputCsvWriter = csv.writer(outputCsv)
        # write custom csv header:
        outputCsvWriter.writerow(outputCsvColumnHeaders)
        # skip input file header:
        next(inputCsvReader)
        for inputRow in inputCsvReader:
            outputCsvWriter.writerow([inputRow[i] for i in inputCsvColumnNumbers])
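As a sketch of the opposite trade-off (low IO, high memory), you could also read the whole input into memory first and write the selection in one go; this reuses the variables defined above:
# reading/writing in one go (low IO, high memory):
with open(inputCsvFilePath, newline='') as inputCsv:
    allInputRows = list(csv.reader(inputCsv))
with open(outputCsvFilePath, 'w', newline='') as outputCsv:
    outputCsvWriter = csv.writer(outputCsv)
    outputCsvWriter.writerow(outputCsvColumnHeaders)
    # allInputRows[1:] skips the input header
    outputCsvWriter.writerows([row[i] for i in inputCsvColumnNumbers] for row in allInputRows[1:])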
Personally, I'd use sqlite for that:
#!/bin/bash
sqlite3 <<EOF
-- input:
.separator ',' "\n"
.import 'input.csv' inputData
-- output:
.mode csv
.header on
.once 'output.csv'
select
user_id as "one"
, login_id as "three"
, password as "five"
from inputData
;
EOF

If your goal is to save some columns from one csv file to another, you can use the pandas library as follows:
import pandas as pd

def save_csv(df, path, cols):
    df[cols].to_csv(path, index=False)

with open('path/to/csv', 'r') as f:
    df = pd.read_csv(f)

# Assuming you want to save columns colA and colB
save_csv(df, 'path/to/dest/csv', ['colA', 'colB'])
You can also use csv.DictReader and csv.DictWriter as another approach, which is longer in code but faster (based on my timing):
import csv

def use_csv():
    def new_dict(d, cols):
        new_dict = {}
        for col in cols:
            new_dict[col] = d[col]
        return new_dict

    with open('path/to/csv', 'r') as f:
        df = csv.DictReader(f)
        with open('path/to/dest/csv', 'w', newline='') as csvfile:
            fieldnames = ['colA', 'colB']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in df:
                data = new_dict(row, fieldnames)
                writer.writerow(data)

Related

Create csv file using python, where all the values after the first space are separated into one column

I need help converting the simple_line.txt file to a csv file using the pandas library. However, I am unable to separate the image filename from the rest of the line; I want all the values after the first space to go into one column.
Here is the file (sample_list.txt), listed row by row:
Image Label
doc_pres223.jpg Durasal
doc_pres224.jpg Tab Cefepime
doc_pres225.jpg Tab Bleomycin
doc_pres226.jpg Budesonide is a corticosteroid,
doc_pres227.jpg prescribed for inflammatory,
I want the csv file to look like this: [desired output shown as an image in the original post]
txt_file = r"./example.txt"
csv_file = r"./example.csv"

separator = "; "

with open(txt_file) as f_in, open(csv_file, "w+") as f_out:
    for line in f_in:
        f_out.write(separator.join(line.split(" ", maxsplit=1)))
try this:
import pandas as pd

def write_file(filename, output):
    rows = []
    lines = open(filename, 'r').readlines()
    for l in range(1, len(lines)):  # skip the header line
        line = lines[l]
        arr = line.split(" ", maxsplit=1)
        image_line = arr[0]
        label_line = arr[1].replace('\n', '')
        rows.append({'Image': image_line, 'Label': label_line})
    # DataFrame.append is deprecated, so build the frame from a list of dicts
    df = pd.DataFrame(rows, columns=['Image', 'Label'])
    df.to_csv(output, index=False)

if __name__ == '__main__':
    write_file('example.txt', 'example.csv')
If the filenames in the Image column are always the same length, then you could just treat it as a fixed-width file. So the first column would be 15 characters, and the rest is the second column. Then just add two empty columns and write it to a new file.
# libraries
import pandas as pd
# set filename
filename = "simple_line.txt"
# read as fixed width
df = pd.read_fwf(filename, header=0, widths=[15, 100])
# add 2 empty columns
df.insert(1, 'empty1', '')
df.insert(2, 'empty2', '')
# save as a new csv file
filenew = "output.csv"
df.to_csv(filenew, sep=';', header=True, index=False)

Split values in CSV that look like JSON

So I have a CSV file with a column called content. However, the contents of that column look like JSON and therefore hold more columns. I would like to split these contents into multiple columns, or extract the final part after "value". See the picture below for an example of the file. Any ideas how to do this? I would prefer using Python. I don't have any experience with JSON.
Using pandas you could do this in a simpler way.
EDIT: updated to handle the single quotes:
import pandas as pd
import json
data = pd.read_csv('test.csv', delimiter="\n")["content"]
res = [json.loads(row.replace("'", '"')) for row in data]
result = pd.DataFrame(res)
result.head()
# Export result to CSV
result.to_csv("result.csv")
[The input csv and the result were shown as images in the original post.]
This script will create a new csv file with the 'value' added to the csv as an additional column
(make sure that the input_csv and output_csv are different filenames)
import csv
import json

input_csv = "data.csv"
output_csv = "data_updated.csv"

values = []
with open(input_csv) as f_in:
    dr = csv.DictReader(f_in)
    for row in dr:
        value = json.loads(row["content"].replace("'", '"'))["value"]
        values.append(value)

with open(input_csv) as f_in:
    with open(output_csv, "w+") as f_out:
        w = csv.writer(f_out, lineterminator="\n")
        r = csv.reader(f_in)
        all = []
        row = next(r)
        row.append("value")
        all.append(row)
        i = 0
        for row in r:
            row.append(values[i])
            all.append(row)
            i += 1
        w.writerows(all)

writing the rows of a csv file to another csv file

I want to write the rows of a csv file to another csv file. I want to change the content of each row as well in a way that if the row is empty, it remains empty and if it is not, any spaces at the beginning and end of the string are omitted. The original csv file has one column and 65422771 rows.
I have written the following to write the rows of the original csv file to the new one:
import csv

csvfile = open('data.csv', 'r')
with open('data 2.csv', "w+") as csv_file1:
    writer = csv.writer(csv_file1)
    count = 0
    for row in csvfile:
        row = row.replace('"', '')
        count += 1
        print(count)
        if row.strip() == '':
            writer.writerow('\n')
        else:
            writer.writerow(row)
However, when the new csv file is made, it is shown that it has 130845543 rows (= count)! The size of the new csv file is also 2 times the size of the original one. How can I create the new csv file with exactly the same number of rows but with the mentioned changes made to them?
Try this:
import csv

with open('data.csv', 'r') as file:
    # keep empty rows empty; strip whitespace from non-empty rows
    rows = [[row[0].strip()] if row else [] for row in csv.reader(file)]

with open('data_out.csv', "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(rows)
Also, as #tripleee mentioned, your file is quite large so you may want to read / write it in chunks. You can use pandas for that.
import pandas as pd

chunksize = 10_000

for chunk in pd.read_csv('data.csv', chunksize=chunksize, header=None):
    chunk[0] = chunk[0].str.strip()
    chunk.to_csv("data_out.csv", mode="a", header=False, index=False)

How do I write a csv based on comparing two csv files [column based]

I have two csv files:
csv1
csv2
(*note: the headers can differ)
csv1 has a single column and csv2 has 5 columns.
Now column 1 of csv1 has some matching values in column 2 of csv2.
My concern is how I can write a csv of the rows where column 2 of csv2 does not have a matching value in column 1 of csv1.
I have attached three files: csv1, csv2 and the expected output.
Expected Output:
ProfileID,id,name,class ,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
5,jlh,antriskh,ASDA,AD
CSV 1:
id,name
10927,prince
109582,kabir
f546416,rahul
g44674,saini
r7341,antriskh
CSV 2:
ProfileID,id,name,class ,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
3,f546416,rahul,AD,FF
44,g44674,saini,DD,FF
5,jlh,antriskh,ASDA,AD
I tried converting them into dictionaries and matching csv1 keys to csv2 values, but it is not working as expected.
def read_csv1(filename):
    prj_structure = {}
    f = open(filename, "r")
    data = f.read()
    f.close()
    lst = data.split("\n")
    prj = ""
    for i in range(0, len(lst)):
        val = lst[i].split(",")
        if len(val) > 0:
            prj = val[0]
        if prj != "":
            if prj not in prj_structure.keys():
                prj_structure[prj] = []
            prj_structure[prj].append([val[1], val[2], val[3], val[4]])
    return prj_structure

def read_csv2(filename):
    prj_structure = {}
    f = open(filename, "r")
    data = f.read()
    f.close()
    lst = data.split("\n")
    prj = ""
    for i in range(0, len(lst)):
        val = lst[i].split(",")
        if len(val) > 0:
            prj = val[0]
        if prj != "":
            if prj not in prj_structure.keys():
                prj_structure[prj] = []
            prj_structure[prj].append([val[0]])
    return prj_structure

csv1_data = read_csv1("csv1.csv")
csv2_data = read_csv2("csv2.csv")

for k, v in csv1_data.items():
    for ks, vs in csv2_data.items():
        if k == vs[0][0]:
            # here it is not working
            sublist = []
            sublist.append(k)
Use the DictReader from the csv package.
import csv

f1 = open('csv1.csv')
csv_1 = csv.DictReader(f1)
f2 = open('csv2.csv')
csv_2 = csv.DictReader(f2)

first_dict = {}
for row in csv_1:
    first_dict[row['name']] = row
f1.close()

f_out = open('output.csv', 'w')
csv_out = csv.DictWriter(f_out, csv_2.fieldnames)
csv_out.writeheader()
for second_row in csv_2:
    if second_row['name'] in first_dict:
        first_row = first_dict[second_row['name']]
        if first_row['id'] != second_row['id']:
            csv_out.writerow(second_row)
f2.close()
f_out.close()
If you have the option, I have always found pandas to be a great tool for importing and manipulating CSV files.
import pandas as pd

# Read in both the CSV files
df_1 = pd.DataFrame(pd.read_csv('csv1.csv'))
df_2 = pd.DataFrame(pd.read_csv('csv2.csv'))

# Iterate over both DataFrames and if any ids in df_2 match
# df_1, remove them from df_2
for num1, row1 in df_1.iterrows():
    for num2, row2 in df_2.iterrows():
        if row1['id'] == row2['id']:
            df_2.drop(num2, inplace=True)

df_2.head()
For any kind of csv processing, using the builtin csv module makes most of the error prone processing trivial. Given your example values, the following code should produce the desired results. I use comprehensions to do the filtering.
import csv
import io
# example data, as StringIO that will behave like file objects
raw_csv_1 = io.StringIO('''\
id,name
10927,prince
109582,kabir
f546416,rahul
g44674,saini
r7341,antriskh''')
raw_csv_2 = io.StringIO('''\
ProfileID,id,name,class,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
3,f546416,rahul,AD,FF
44,g44674,saini,DD,FF
5,jlh,antriskh,ASDA,AD''')
# with your actual data, you would use real file objects instead, e.g.
# raw_csv_1 = open('location/of/your/csv_1', newline='')
# raw_csv_2 = open('location/of/your/csv_2', newline='')
Then we need to transform them into csv.reader objects:
csv_1 = csv.reader(raw_csv_1)
next(csv_1) # consume once to skip the header
csv_2 = csv.reader(raw_csv_2)
header = next(csv_2) # consume once to skip the header, but store it
Last but not least, read both files, collect the ids of the first csv in a set to use as a lookup table, filter the second csv with it, and write the result back as 'result.csv' to your file system.
vals_1 = list(csv_1)
vals_2 = list(csv_2)

skip_keys = {id_ for id_, name in vals_1}
result = [row for row in vals_2 if row[1] not in skip_keys]

# at this point, result contains
# [['1', 'lkha', 'prince', 'sfasd', 'DAS'],
#  ['2', 'hgfhfk', 'kabir', 'AD', 'AD'],
#  ['5', 'jlh', 'antriskh', 'ASDA', 'AD']]

with open('result.csv', 'w', newline='') as result_file:
    csv.writer(result_file).writerows([header] + result)

Copy a column from a txt file to a column in Excel file, Python

I am trying to copy the contents of the first three columns of a txt file to the first three columns of an Excel file.
Here is my code:
import xlsxwriter

worksheet1 = workbook.add_worksheet()
worksheet1.write('A1', 'Time', bold)
worksheet1.write('B1', 'User Value', bold)
worksheet1.write('C1', 'Address', bold)
worksheet1.write('D1', 'Serial Number', bold)

items = os.listdir(directory)
for FILE in items:
    if FILE.endswith('file.txt'):
        FileSelection = directory + '/' + FILE
        Array1 = []
        with open(FileSelection, 'r') as f:
            for line in f:
                valuesList = line.split('\t')
                #print valuesList
                Array1.append(valuesList)
        for j in range(len(Array1)):
            if j == 0:
                continue
            else:
                print Array1[j][0]
                # I want to copy columns A to D, but starting from the second row
                worksheet1.write('A2:D2', Array1[j][0])
However, it copies the whole txt array to the first column of the Excel file!
It looks like the 'text file' you want is actually a csv, only with a different delimiter (This is a bit of a confusing convention). You can use the csv module to specify this:
>>> import csv
>>> with open('your file.txt', 'rb') as csvfile:
... reader = csv.reader(csvfile, delimiter='\t')
... for row in reader:
... print ', '.join(row)
There is a similar csv writer module, or do you actually want an excel file?
I have also noticed that in your code valuesList should perhaps be truncated to the first 3 columns, as that is what you appear to want...
This is probably why you are getting all of the original file. So change
valuesList = line.split('\t')
to
valuesList = line.split('\t')[:3]
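If you do want the Excel output, here is a minimal Python 3 sketch (the file and header names are illustrative, and a tab-delimited text file with a header line is assumed) that writes the first three columns of each data row into columns A-C of the worksheet, starting from row 2:
import csv
import xlsxwriter

workbook = xlsxwriter.Workbook('output.xlsx')  # illustrative output name
worksheet1 = workbook.add_worksheet()
worksheet1.write_row('A1', ['Time', 'User Value', 'Address'])

with open('file.txt', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    next(reader)  # skip the header line of the text file
    for row_number, values in enumerate(reader, start=1):
        # row_number 1 is Excel row 2; columns 0-2 are A-C
        worksheet1.write_row(row_number, 0, values[:3])

workbook.close()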
