Reorder the rows of a specific column in CSV - python

I have a CSV file and I'm trying to sort its rows in ascending order by the values of one large column (named CRIM), so that I can do further processing afterwards. First, I tried this:
def house_data():
with open('data.csv', newline='') as csvfile:
data = csv.DictReader(csvfile)
for line in data:
# NOTE: line['CRIM'] is one string per row (e.g. '1.96'). sorted() on a
# string sorts its individual characters, and it is called once per row,
# so the rows themselves are never reordered relative to each other.
print(sorted(line['CRIM']))
But this sorted the characters inside each value instead of sorting the values relative to each other.
For example, if I had the number 1.96 and 0.92 , they would give me an output like this:
['1', '.','6', '9']
['0','.','2','9']
but I wanted
['0.92']
['1.96']
I read something about using the lambda and I tried this, but I didn't get any output.
def house_data():
with open('data.csv', newline='') as csvfile:
data = csv.DictReader(csvfile)
# NOTE: sorted() returns a new list and leaves its argument untouched; the
# result is discarded here. The call also consumes the DictReader, so the
# loop below has nothing left to iterate over and prints nothing.
# The key compares the CRIM values as strings, not as numbers.
sorted(data, key=lambda line: line['CRIM'])
for line in data:
print(line['CRIM'])

use pandas
import pandas as pd
from pathlib import Path

file_path = Path('data.csv')
dataframe = pd.read_csv(file_path)  # pass other required parameters.
# sort_values() returns a new, sorted DataFrame and leaves the original
# untouched, so the result has to be assigned back to take effect.
dataframe = dataframe.sort_values(['CRIM'])

First load all the data into a list and then sort the list using 'CRIM' as the key:
import csv


def house_data(path='data.csv', column='CRIM'):
    """Return the rows of a CSV file sorted numerically by one column.

    Args:
        path: CSV file to read; its first line is used as the header.
        column: Name of the column to sort on. Values are compared as
            floats, so '0.92' sorts before '1.96' and '9' before '10'.

    Returns:
        A list with one dict per data row, in ascending order of
        ``column``.

    Raises:
        KeyError: if a row has no ``column`` entry.
        ValueError: if a value in ``column`` cannot be parsed as a float.
    """
    with open(path, newline='') as csvfile:
        # DictReader is a one-shot iterator, so materialise every row
        # into a list before the file is closed.
        lines = list(csv.DictReader(csvfile))
    # Sort in place, comparing the column as numbers rather than as text.
    lines.sort(key=lambda row: float(row[column]))
    return lines

Related

How to delete lines from csv file using python?

I have a CSV file. It contains class names and types of code smell, and for each class I calculated the number of occurrences of each code smell. The final count for a class is on the last line for that class, so there are many repeated class names.
I need just the last line of the class name.
This is a part of my CSV file, because the whole file is too long:
NameOfClass,LazyClass,ComplexClass,LongParameterList,FeatureEnvy,LongMethod,BlobClass,MessageChain,RefusedBequest,SpaghettiCode,SpeculativeGenerality
com.nirhart.shortrain.MainActivity,NaN,NaN,NaN,NaN,NaN,NaN,1,NaN,NaN,NaN
com.nirhart.shortrain.path.PathParser,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathParser,NaN,1,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathParser,NaN,1,1,1,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathParser,NaN,1,2,1,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathParser,NaN,1,2,1,1,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathPoint,1,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathPoint,1,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.TrainPath,NaN,NaN,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.rail.RailActionActivity,NaN,NaN,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.rail.RailActionActivity,NaN,NaN,NaN,1,1,NaN,NaN,NaN,NaN,NaN
To filter out the last entry for groups of NameOfClass, you can make use of Python's groupby() function to return lists of rows with the same NameOfClass. The last entry from each can then be written to a file.
from itertools import groupby
import csv

with open('data_in.csv', newline='') as f_input:
    with open('data_out.csv', 'w', newline='') as f_output:
        reader = csv.reader(f_input)
        writer = csv.writer(f_output)
        # groupby() bundles runs of consecutive rows that share the same
        # first column; only the final row of each run is written out.
        for _name, group in groupby(reader, key=lambda x: x[0]):
            *_, last_row = group
            writer.writerow(last_row)
For the data you have given, this would give you the following output:
NameOfClass,LazyClass,ComplexClass,LongParameterList,FeatureEnvy,LongMethod,BlobClass,MessageChain,RefusedBequest,SpaghettiCode,SpeculativeGenerality
com.nirhart.shortrain.MainActivity,NaN,NaN,NaN,NaN,NaN,NaN,1,NaN,NaN,NaN
com.nirhart.shortrain.path.PathParser,NaN,1,2,1,1,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.PathPoint,1,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.path.TrainPath,NaN,NaN,NaN,1,NaN,NaN,NaN,NaN,NaN,NaN
com.nirhart.shortrain.rail.RailActionActivity,NaN,NaN,NaN,1,1,NaN,NaN,NaN,NaN,NaN
To get just the unique class names (ignoring repeated rows, not deleting them), you can do this:
import csv

with open('my_file.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # Skip the header row so 'NameOfClass' is not reported as a class name.
    next(reader, None)
    # A set keeps one copy of each name. `if row` ignores blank lines,
    # which csv.reader yields as empty lists.
    classNames = {row[0] for row in reader if row}
print(classNames)
# {'com.nirhart.shortrain.MainActivity', 'com.nirhart.shortrain.path.PathParser', 'com.nirhart.shortrain.path.PathPoint', ...}
This is just using the csv module to open a file, getting the first value in each row, and then taking only the unique values of those. You can then manipulate the resulting set of strings (you might want to cast it back to a list via list(classNames)) however you need to.
If you intend to later process the data in pandas, filtering duplicates is trivial:
import pandas as pd

df = pd.read_csv('file.csv')
# Flag every row whose NameOfClass shows up again further down, then keep
# only the unflagged rows, i.e. the last occurrence of each class name.
is_superseded = df['NameOfClass'].duplicated(keep='last')
df = df.loc[~is_superseded]
If you just want to build a new csv file with only the expected lines, pandas is overkill and the csv module is enough:
import csv

# open() replaces file(), which is not a builtin in Python 3.
with open('file.csv', newline='') as fdin, \
        open('new_file.csv', 'w', newline='') as fdout:
    rd = csv.reader(fdin)
    wr = csv.writer(fdout)
    header = next(rd, None)
    if header is not None:
        wr.writerow(header)  # copy the header line
    old = None
    for row in rd:
        if not row:
            continue  # csv.reader yields [] for blank lines
        # A change of class name means `old` was the last row of its group.
        if old is not None and old[0] != row[0]:
            wr.writerow(old)
        old = row
    # Write the final group; `old` is still None when there were no data rows.
    if old is not None:
        wr.writerow(old)

How to get specific columns in a certain range from a csv file without using pandas

For some reason the pandas module does not work and I have to find another way to read a (large) csv file and have as Output specific columns within a certain range (e.g. first 1000 lines). I have the code that reads the entire csv file, but I haven't found a way to display just specific columns.
Any help is much appreciated!
import csv
fileObj = open('apartment-data-all-4-xaver.2018.csv')
csvReader = csv.reader( fileObj )
for row in csvReader:
print row
fileObj.close()
I created a small csv file with the following contents:
first,second,third
11,12,13
21,22,23
31,32,33
41,42,43
You can use the following helper function that uses namedtuple from collections module, and generates objects that allows you to access your columns like attributes:
import csv
from collections import namedtuple
from itertools import islice


def get_first_n_lines(file_name, n):
    """Yield the first ``n`` data rows of a CSV file as named tuples.

    The first line of the file is taken as the header; its entries become
    the field names, so each column can be read as an attribute
    (``row.first``, ``row.third``, ...).

    Args:
        file_name: Path of the CSV file to read.
        n: Maximum number of data rows to yield. Zero or a negative
            number yields nothing.

    Yields:
        One named tuple per data row, with every value left as a string.

    Raises:
        ValueError: if a header entry is not usable as a namedtuple field
            name.
        TypeError: if a row has a different number of values than the
            header.
    """
    with open(file_name, newline='') as file_obj:
        csv_reader = csv.reader(file_obj)
        # A default avoids StopIteration escaping the generator (which
        # Python turns into RuntimeError) when the file is empty.
        header = next(csv_reader, None)
        if header is None:
            return
        Tuple = namedtuple('Tuple', header)
        # islice stops after n rows without reading row n + 1, and, unlike
        # a post-yield counter check, yields nothing at all when n <= 0.
        for row in islice(csv_reader, max(n, 0)):
            yield Tuple(*row)
If you want to print first and third columns, having n=3 lines, you use the method like this (Python 3.6 +):
for line in get_first_n_lines(file_name='csv_file.csv', n=3):
print(f'{line.first}, {line.third}')
Or like this (Python 3.0 - 3.5):
for line in get_first_n_lines(file_name='csv_file.csv', n=3):
print('{}, {}'.format(line.first, line.third))
Outputs:
11, 13
21, 23
31, 33
use csv dictreader and then filter out specific rows and columns
import csv

with open('names.csv', newline='') as csvfile:
    # Each row becomes a dict keyed by the column names in the header line.
    data = list(csv.DictReader(csvfile))
colnames = ['col1', 'col2']
# Slicing clamps to the list length, so a file with fewer than 1000 rows
# prints what it has instead of raising IndexError.
for row in data[:1000]:
    print(row[colnames[0]], row[colnames[1]])

Generate a new csv file and order data in ascending numeric order?

I have written a code that implements the given regex on every postcode that is included in the 'import_data.csv' file. It then generates a new csv file 'failed_validation.csv' which contains all the postcodes where the validation fails. The structure of both files is in the following format:
row_id postcode
134534 AABC 123
243534 AACD 4PQ
534345 QpCD 3DR
... ...
Following is my code:
import csv
import re
regex = r"(GIR\s0AA)|((([A-PR-UWYZ][0-9][0-9]?)|(([A-PR-UWYZ][A-HK-Y][0-9]((BR|FY|HA|HD|HG|HR|HS|HX|JE|LD|SM|SR|WC|WN|ZE)[0-9])[0-9])|([A-PR-UWYZ][A-HK-Y](AB|LL|SO)[0-9])|(WC[0-9][A-Z])|(([A-PR-UWYZ][0-9][A-HJKPSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRVWXY]))))\s[0-9][ABD-HJLNP-UW-Z]{2})"
codes = []
with open('../import_data.csv','r') as f:
r = csv.reader(f, delimiter=',')
for row in r:
if not(re.findall(regex, row[1])):
codes.append([row[0],row[1]])
with open('failed_validation.csv','w',newline='') as fp:
a = csv.writer(fp)
a.writerows(codes)
The code works fine but what I actually want is the postcodes in the new file need to be ordered as per the row_id, in ascending numeric order. I know how to generate a new file with Python, but I don't know how to order the data inside that file in ascending numeric order.
This will do it and preserve the header row:
import csv
import re
regex = r"(GIR\s0AA)|((([A-PR-UWYZ][0-9][0-9]?)|(([A-PR-UWYZ][A-HK-Y][0-9]((BR|FY|HA|HD|HG|HR|HS|HX|JE|LD|SM|SR|WC|WN|ZE)[0-9])[0-9])|([A-PR-UWYZ][A-HK-Y](AB|LL|SO)[0-9])|(WC[0-9][A-Z])|(([A-PR-UWYZ][0-9][A-HJKPSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRVWXY]))))\s[0-9][ABD-HJLNP-UW-Z]{2})"
codes = []
with open('import_data.csv', 'r', newline='') as fp:
    reader = csv.reader(fp, delimiter=',')
    header = next(reader)
    for row in reader:
        # Keep only the rows whose postcode does not match the pattern.
        if not re.search(regex, row[1]):
            codes.append([row[0], row[1]])
# row_id is read as text, and a plain sorted() would order it
# lexicographically ('1000' before '999'). Sort on the integer value to get
# ascending numeric order; the postcode only breaks ties.
codes.sort(key=lambda code: (int(code[0]), code[1]))
with open('failed_validation.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(header)
    writer.writerows(codes)
Sort your codes list before writing to the file.
# Assumes codes[0] is the header row, i.e. the reading loop did not skip
# it -- verify against the code that builds `codes`.
header = codes[0]
# row_id is text; convert it to int so the order is numeric, not lexicographic.
codes = sorted(codes[1:], key=lambda code: int(code[0]))
with open('failed_validation.csv', 'w', newline='') as fp:
    a = csv.writer(fp)
    a.writerow(header)
    a.writerows(codes)

Python - splitting data as columns in csv file

I have data in a csv file that is imported like this:
import csv
with open('Half-life.csv', 'r') as f:
data = list(csv.reader(f))
The data comes out as a list of rows, for example data[0] = ['10', '2', '2'], and so on.
What I want instead is to retrieve the data as columns rather than rows; in this case there are 3 columns.
You can create three separate lists, and then append to each using csv.reader.
import csv

c1, c2, c3 = [], [], []
with open('Half-life.csv', 'r') as f:
    for record in csv.reader(f, delimiter=','):
        # Fan each row out into one list per column.
        c1.append(record[0])
        c2.append(record[1])
        c3.append(record[2])
A little more automatic and flexible version of Alexander's answer:
import csv
from collections import defaultdict

columns = defaultdict(list)
with open('Half-life.csv', 'r') as f:
    for row in csv.reader(f, delimiter=','):
        # Append each value to the list kept for its column number.
        for index, value in enumerate(row):
            columns[index].append(value)
# Converting to a plain dict is only needed if you want a KeyError for
# invalid column numbers instead of a silently created empty list.
columns = dict(columns)
You could also modify this to use column headers instead of column numbers.
import csv
from collections import defaultdict

columns = defaultdict(list)
with open('Half-life.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
    for row in reader:
        # Look each value up by position so that a row shorter than the
        # header still raises IndexError.
        for position, header in enumerate(headers):
            columns[header].append(row[position])
# Converting to a plain dict is only needed if you want a KeyError for
# invalid column names instead of a silently created empty list.
columns = dict(columns)
Another option, if you have numpy installed, you can use loadtxt to read a csv file into a numpy array. You can then transpose the array if you want more columns than rows (I wasn't quite clear on how you wanted the data to look). For example:
import numpy as np
# Load data
data = np.loadtxt('csv_file.csv', delimiter=',')
# Transpose data if needs be
data = np.transpose(data)

Convert a csv to a dictionary with multiple values?

I have a csv file like this:
pos,place
6696,266835
6698,266835
938,176299
940,176299
941,176299
947,176299
948,176299
949,176299
950,176299
951,176299
770,272944
2751,190650
2752,190650
2753,190650
I want to convert it to a dictionary like the following:
{266835:[6696,6698],176299:[938,940,941,947,948,949,950,951],190650:[2751,2752,2753]}
And then, fill the missing numbers in the range in the values:
{266835:[6696,6697,6698],176299:[938,939,940,941,942,943,944,945,946,947,948,949,950,951],190650:[2751,2752,2753]}
Right now i have tried to build the dictionary using solution suggested here, but it overwrites the old value with new one.
Any help would be great.
Here is a function that i wrote for converting csv2dict
def csv2dict(filename):
"""
Read a two-column csv file and convert it into a dictionary.

The first line of the file is skipped. Every remaining row contributes
one entry whose key is the second column and whose value is the first
column.
"""
import csv
with open(filename) as f:
f.readline()#ignore first line
reader=csv.reader(f,delimiter=',')
# NOTE: dict() keeps only the last value seen for a repeated key, so when
# several rows share the same second column, the earlier first-column
# values are overwritten rather than collected.
mydict=dict((rows[1],rows[0]) for rows in reader)
return mydict
Easiest is to use collections.defaultdict() with a list:
import csv
from collections import defaultdict

data = defaultdict(list)
# The csv module needs a text-mode file opened with newline='' on Python 3;
# a file opened with 'rb' makes csv.reader raise an error there.
with open(inputfilename, newline='') as infh:
    reader = csv.reader(infh)
    next(reader, None)  # skip the header
    for col1, col2 in reader:
        positions = data[col2]
        positions.append(int(col1))
        if len(positions) > 1:
            # Fill the gaps between the smallest and largest value seen so
            # far. list(...) keeps the stored value appendable: on Python 3
            # a bare range() is lazy and has no append().
            data[col2] = list(range(min(positions), max(positions) + 1))
This also expands the ranges on the fly as you read the data.
Based on what you have tried -
import csv
from collections import defaultdict

arch_dict = defaultdict(list)
# open archive reader; the with-block closes the file when reading is done
with open("myfile.csv", newline="") as myFile:
    archive = csv.reader(myFile, delimiter=',')
    next(archive, None)  # skip the header line (pos,place)
    for row in archive:
        # Group the first-column values under their second-column value.
        arch_dict[row[1]].append(row[0])
print(arch_dict)

Categories

Resources