Pandas read_table columns with multiple lines - python

I am working with a text file (ClassTest.txt) and pandas. The text file has three tab-separated columns: Title, Description, and Category. Title and Description are ordinary strings and Category is a (non-zero) integer.
I was gathering the data as follows:
data = pd.read_table('ClassTest.txt')
feature_names = ['Title', 'Description']
X = data[feature_names]
y = data['Category']
However, because values in the Description column can themselves contain newlines, 'y' ends up with too many rows: most items in the Description column span multiple lines. I tried to get around this by regenerating the file with '|' as the record separator instead of the newline character, and using:
data = pd.read_table('ClassTest.txt', lineterminator='|')
X = data[feature_names]
y = data['Category']
This time, I get the error:
pandas.errors.ParserError: Error tokenizing data. C error: Expected 3 fields in line 20, saw 5
Can anyone help me with this issue?
EDIT: Adding previous code
con = lite.connect('JobDetails.db')
cur = con.cursor()
cur.execute('''SELECT Title, Description, Category FROM ReviewJobs''')
results = [list(each) for each in cur.fetchall()]
cur.execute('''SELECT Title, Description, Category FROM Jobs''')
for each in cur.fetchall():
    results.append(list(each))

a = open('ClassTest.txt', 'ab')
newLine = "|"
a.write(u''.join(c for c in 'Title\tDescription\tCategory' + newLine).encode('utf-8'))
for r in results:
    toWrite = "".encode('utf-8')
    title = u''.join(c for c in r[0].replace("\n", " ")).encode('utf-8') + "\t".encode('utf-8')
    description = u''.join(c for c in r[1]).encode('utf-8') + "\t".encode('utf-8')
    toWrite += title + description
    toWrite += str(r[2]).encode('utf-8') + newLine.encode('utf-8')
    a.write(toWrite)
a.close()

pandas.read_table() is deprecated; use read_csv() instead. And then really use the CSV format instead of writing lots of code that produces something similar but can't cope with record or field delimiters within the fields. There's the csv module in the Python standard library for that.
Opening the file as a text file and passing the encoding to open() spares you from encoding everything yourself in several different places.
#!/usr/bin/env python3
from contextlib import closing
import csv
import sqlite3


def main():
    with sqlite3.connect("JobDetails.db") as connection:
        with closing(connection.cursor()) as cursor:
            #
            # TODO Having two tables with the same columns for essentially
            # the same kind of records smells like a broken DB design.
            #
            rows = list()
            for table_name in ["reviewjobs", "jobs"]:
                cursor.execute(
                    f"SELECT title, description, category FROM {table_name}"
                )
                rows.extend(cursor.fetchall())

    # newline="" lets the csv module handle line endings itself.
    with open("ClassTest.txt", "a", encoding="utf8", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter="\t")
        writer.writerow(["Title", "Description", "Category"])
        for title, description, category in rows:
            # Keep the newline-to-space replacement for titles; the csv
            # module would quote embedded newlines anyway.
            writer.writerow([title.replace("\n", " "), description, category])


if __name__ == "__main__":
    main()
And then in the other program:
data = pd.read_csv("ClassTest.txt", delimiter="\t")
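Why this works where the pipe-terminated file failed: csv.writer quotes any field that contains the delimiter or a newline, and pandas' read_csv honours those quotes, so multi-line descriptions stay in one record. A minimal round-trip sketch with made-up data:

import csv
import io

import pandas as pd

# Hypothetical record whose description spans two lines.
buffer = io.StringIO()
writer = csv.writer(buffer, delimiter="\t")
writer.writerow(["Title", "Description", "Category"])
writer.writerow(["Some job", "line one\nline two", 3])

# read_csv keeps the quoted newline inside the field, so we still
# get exactly one data row back.
buffer.seek(0)
data = pd.read_csv(buffer, delimiter="\t")
assert len(data) == 1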

Related

How to read fields from JSON-LD to CSV?

I am trying to extract values from JSON-LD to CSV as they are in the file. There are a couple of issues I am facing:
1. The values being read for different fields are getting truncated in most of the cases. In the remaining cases the value of some other field appears under another field.
2. I am also getting an error, 'Additional data', after some 4,000 lines.
The file is quite big (half a GB). I am attaching a shortened version of my code. Please tell me where I am going wrong.
The input file: I have shortened it and kept it here, since there was no way of putting it in the question directly.
https://github.com/Architsi/json-ld-issue
I tried writing this script, and I tried multiple online converters too.
import csv, sys, math, operator, re, os, json, ijson
from pprint import pprint

filelist = []
for file in os.listdir("."):
    if file.endswith(".json"):
        filelist.append(file)

for input in filelist:
    newCsv = []
    splitlist = input.split(".")
    output = splitlist[0] + '.csv'
    newFile = open(output, 'w', newline='')  # wb for windows, else you'll see newlines added to csv
    # initialize csv writer
    writer = csv.writer(newFile)
    # Name of the columns
    header_row = ('Format', 'Description', 'Object', 'DataProvider')
    writer.writerow(header_row)
    with open(input, encoding="utf8") as json_file:
        data = ijson.items(json_file, 'item')
        # passing all the values through try except
        for s in data:
            source = s['_source']
            try:
                source_resource = source['sourceResource']
            except:
                print("Warning: No source resource in record ID: " + id)
            try:
                data_provider = source['dataProvider'].encode()
            except:
                data_provider = "N/A"
            try:
                _object = source['object'].encode()
            except:
                _object = "N/A"
            try:
                descriptions = source_resource['description']
                string = ""
                for item in descriptions:
                    if len(descriptions) > 1:
                        description = item.encode()  # + " | "
                    else:
                        description = item.encode()
                    string = string + description
                description = string.encode()
            except:
                description = "N/A"
            created = ""
            # writing it to csv
            write_tuple = ('format', description, _object, data_provider)
            writer.writerow(write_tuple)
    print("File written to " + output)
    newFile.close()
The error that I am getting is this: raise common.JSONError('Additional data')
The expected result is a CSV file with all the columns and correct values.
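For what it's worth, ijson's 'Additional data' error typically means that content follows the end of the first top-level JSON document, i.e. the file holds several documents back to back rather than a single array. Under that assumption, a sketch using only the standard library to iterate over concatenated documents (the helper name iter_documents is made up, and it reads the whole file into memory, so it is only a sketch for moderately sized inputs):

import json

def iter_documents(path):
    # Yield each top-level JSON document from a file that may contain
    # several documents back to back (one likely cause of ijson's
    # 'Additional data' error).
    decoder = json.JSONDecoder()
    with open(path, encoding="utf8") as f:
        text = f.read()
    pos = 0
    while pos < len(text):
        # Skip whitespace between documents.
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        document, pos = decoder.raw_decode(text, pos)
        yield document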

Python Loop writing to CSV files

I am trying to generate a CSV file for each query's output. I have multiple SELECT queries in a single SQL file (queries.sql), and I am looping through them, executing each against the database and writing each query's output to its own CSV file. When I execute the code, all the queries run in the database, but only the last query's result set is written to a CSV file; all the other CSV files contain no records. Any help is appreciated.
col_pattern = "(^|[_-])SSN#?($|[_-])|^SS#?$|(^|[_-])(SSN|SOC.*SEC.*).?(ID|NO|NUMBERS?|NUM|NBR|#)($|[_-])|^SOCIAL.?SEC(URITY)?#?$"
SQL = "select OWNER||'_'||TABLE_NAME||'_'||column_name from ALL_TAB_COLS where REGEXP_LIKE (column_name, :1) and owner NOT IN ('SYS','SYSMAN') order by table_name,column_name"
cursor.execute(SQL, (col_pattern,))

for row_data in cursor:
    if not row_data[0].startswith('BIN$'):
        fileName = row_data[0]
        csv_file_dest = "/u01/exp/test/identity/csvdata/" + fileName + ".csv"
        outputFile = open(csv_file_dest, 'w')  # 'wb'
        output = csv.writer(outputFile, dialect='excel')

f = open('/u01/exp/test/identity/queries.sql')
full_sql = f.read()
sql_commands = full_sql.replace('\n', "").split(';')[:-1]
# print(sql_commands)

for sql_command in sql_commands:
    curs2 = cursor.execute(sql_command)
    if printHeader:  # add column headers if requested
        cols = []
        for col in curs2.description:
            cols.append(col[0])
        output.writerow(cols)
    for row_data in curs2:  # add table rows
        output.writerow(row_data)
outputFile.close()
Looks like it is because all the SQL data is being written to whatever the variable "output" was last set to: "output" is reassigned on every pass through the first loop, so only the file from the final pass ever receives rows, and everything written to it is effectively overwriting the same destination.
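A sketch of one way to restructure it so every query lands in its own file, by opening the output inside the query loop (the numbered file names are made up; cursor and printHeader are the question's existing variables):

import csv

with open('/u01/exp/test/identity/queries.sql') as f:
    # Use a space so keywords split across lines don't get fused together.
    sql_commands = f.read().replace('\n', ' ').split(';')[:-1]

for i, sql_command in enumerate(sql_commands):
    # One output file per query, created right before it is written.
    csv_file_dest = "/u01/exp/test/identity/csvdata/query_%d.csv" % i
    with open(csv_file_dest, 'w') as outputFile:
        output = csv.writer(outputFile, dialect='excel')
        curs2 = cursor.execute(sql_command)
        if printHeader:
            output.writerow([col[0] for col in curs2.description])
        for row_data in curs2:
            output.writerow(row_data)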

Using pandas to generate new csv by comparing row2 of each, and excluding duplicates

I want to take data from a csv file, call it A (it contains user account info, with passwords, for active users in a program called IXL), compare it to a csv file, call it B, that is auto-generated for all currently active students, and create a new csv file that contains all info from A plus all info from B, except for rows where there are duplicates in column 3 (i.e. row[2] of each record in hallpass_raw.csv).
The python script I have so far only generates the B csv file, which I then want to compare to the A csv file.
The A csv file contains current users and their current passwords. The B csv file would also contain those users, but it generates a new password for them, which would prevent current users from logging in, so I want to exclude users where there are duplicates...
import csv
import subprocess


def ixl():
    with open(r'C:\Users\sftp\PS\IMPORTED\hallpass_raw.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        with open(r'C:\Users\sftp\PS\IMPORTED\pythonscripts\ixl\ixl_raw.csv', mode='w', newline='') as output_file:
            write = csv.writer(output_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            for row in csv_reader:
                subjects = 'Math, ELA'
                a = row[0]
                b = row[1]
                c = row[2]
                studentnumber = row[3]
                firstname = row[4]
                middlename = row[5]
                lastname = row[6]
                teacher = row[7]
                idnumber = row[8]
                phone = row[12]
                gradelevel = int(row[13])
                fatheremail = row[14]
                motheremail = row[15]
                studentemail = firstname + "." + lastname.replace(" ", "") + "#domain.org"
                username = idnumber + "#domain"
                password = idnumber + lastname
                father = row[9]
                # Only writes where grade is 3, 5, 7, or 9-12
                if (gradelevel == 5) or (gradelevel == 3) or (gradelevel == 7) or (gradelevel > 8):
                    write.writerow([lastname, firstname, idnumber, studentemail, gradelevel, teacher, subjects, username, password])


def main():
    """
    Controls the program execution
    :param in_file: the name of the input file.
    :return: None
    """
    ixl()


if __name__ == '__main__':
    main()
Here are the expected results:
A csv file currentusers.csv contains (row[2] is the student idnumber):
john,smith,5456,john.smith#example.com,11,Jones,"Math,ELA",5456#domain,pass23
tara,smith,1234,tara.smith#example.com,10,North,"Math,ELA",1234#domain,pass67
matt,wells,2345,matt.wells#example.com,9,Jones,"Math,ELA",2345#domain,pass76
tony,dean,3456,tony.dean#example.com,12,Sims,"Math,ELA",3456#domain,pass19
eric,watts,4567,eric.watts#example.com,7,Sims,"Math,ELA",4567#domain,pass12
B csv file ixl_raw.csv contains (this file is generated by the script shown above, which covers all active students and therefore includes the users already in the A csv file, which is why duplicates need to be found and removed based on row[2]):
john,smith,5456,john.smith#example.com,11,Jones,"Math,ELA",5456#domain,5456smith
tara,smith,1234,tara.smith#example.com,10,North,"Math,ELA",1234#domain,1234smith
matt,wells,2345,matt.wells#example.com,9,Jones,"Math,ELA",2345#domain,2345wells
tony,dean,3456,tony.dean#example.com,12,Sims,"Math,ELA",3456#domain,3456dean
eric,watts,4567,eric.watts#example.com,7,Sims,"Math,ELA",4567#domain,4567watts
new,student,5678,new.student#example.com,5,Parks,"Math,ELA",5678#domain,5678student
If I upload the B csv file to the IXL system, it will overwrite current users' passwords, and current users would have no idea what their changed password is and therefore could no longer sign in.
So instead, I want to compare the A and B csv files and generate a new csv file that would look like this:
john,smith,5456,john.smith#example.com,11,Jones,"Math,ELA",5456#domain,pass23
tara,smith,1234,tara.smith#example.com,10,North,"Math,ELA",1234#domain,pass67
matt,wells,2345,matt.wells#example.com,9,Jones,"Math,ELA",2345#domain,pass76
tony,dean,3456,tony.dean#example.com,12,Sims,"Math,ELA",3456#domain,pass19
eric,watts,4567,eric.watts#example.com,7,Sims,"Math,ELA",4567#domain,pass12
new,student,5678,new.student#example.com,5,Parks,"Math,ELA",5678#domain,5678student
As you can see, this includes all current users from A and all users from B except those rows of B with a duplicate in the third column (row[2] of ixl_raw.csv, the student ID number).
I may be misinterpreting your explanation of duplicate. It sounds like you want to Right Join on B, which you can see an example of here: http://www.sql-join.com/sql-join-types/
If that's the case:
import pandas as pd

A = pd.read_csv("your/location/A.csv")
B = pd.read_csv("your/location/B.csv")

# Join with pandas merge; your .csv files must have a labeled
# student_id column to use this
C = pd.merge(A, B, on='student_id', how='right')

# create a new csv with the joined data
C.to_csv("your/location/C.csv")
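If the goal is literally "all of A, plus rows of B whose student ID is not already in A", a concat-and-filter may match the expected output more closely than a join. A sketch, assuming the files have no header row (as in the samples above) and the ID is the third column; the column names here are made up:

import pandas as pd

cols = ["first", "last", "student_id", "email", "grade",
        "teacher", "subjects", "username", "password"]
A = pd.read_csv("currentusers.csv", header=None, names=cols)
B = pd.read_csv("ixl_raw.csv", header=None, names=cols)

# Keep all of A, then only those rows of B whose student_id is new.
C = pd.concat([A, B[~B["student_id"].isin(A["student_id"])]])
C.to_csv("combined.csv", index=False, header=False)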

Txt file to excel conversion in python

I'm trying to convert a text file to an Excel sheet in Python. The txt file contains data in the format specified below.
Column names: reg no, zip code, loc id, emp id, lastname, first name. Each record has one or more error numbers, and each record has its column names listed above its values. I would like to create an Excel sheet containing reg no, firstname, lastname, and the errors listed in separate rows for each record.
How can I put the records in an Excel sheet? Should I be using regular expressions? And how can I insert error numbers in different rows for the corresponding record?
Here is the link to the input file:
https://github.com/trEaSRE124/Text_Excel_python/blob/master/new.txt
Any code snippets or suggestions are kindly appreciated.
Here is a draft of the code. Let me know if any changes are needed:
# import pandas as pd
from collections import OrderedDict
from datetime import date
import csv

with open('in.txt') as f:
    with open('out.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # Remove initial clutter
        while "INPUT DATA" not in f.readline():
            continue
        header = ["REG NO", "ZIP CODE", "LOC ID", "EMP ID", "LASTNAME", "FIRSTNAME", "ERROR"]
        data = list()
        errors = list()
        spamwriter.writerow(header)
        print header
        while True:
            line = f.readline()
            errors = list()
            if "END" in line:
                exit()
            try:
                int(line.split()[0])
                data = line.strip().split()
                f.readline()  # get rid of \n
                line = f.readline()
                while "ERROR" in line:
                    errors.append(line.strip())
                    line = f.readline()
                spamwriter.writerow(data + errors)
                # csv writers have no flush(); flush the underlying file.
                csvfile.flush()
            except:
                continue
# while(True):
#     line = f.readline()
Run it with Python 2. The errors are appended as subsequent columns; doing it exactly the way you want is slightly more complicated. I can fix it if still needed.
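If you would rather run it under Python 3, the version-specific pieces above are, as far as I can tell, the binary-mode open and the print statement; a sketch of the replacements:

import csv

# Python 3: open the CSV in text mode with newline='' instead of 'wb',
# and call print as a function (print(header) instead of print header).
with open('out.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quoting=csv.QUOTE_MINIMAL)
    header = ["REG NO", "ZIP CODE", "LOC ID", "EMP ID",
              "LASTNAME", "FIRSTNAME", "ERROR"]
    spamwriter.writerow(header)
    print(header)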
You can do this using the openpyxl library, which is capable of depositing items directly into a spreadsheet. This code shows how to do that for your particular situation.
NEW_PERSON, ERROR_LINE = 1, 2

def Line_items():
    with open('katherine.txt') as katherine:
        for line in katherine:
            line = line.strip()
            if not line:
                continue
            items = line.split()
            if items[0].isnumeric():
                yield NEW_PERSON, items
            elif items[:2] == ['ERROR', 'NUM']:
                yield ERROR_LINE, line
            else:
                continue

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws['A2'] = 'REG NO'
ws['B2'] = 'LASTNAME'
ws['C2'] = 'FIRSTNAME'
ws['D2'] = 'ERROR'
row = 2
for kind, data in Line_items():
    if kind == NEW_PERSON:
        row += 2
        ws['A{:d}'.format(row)] = int(data[0])
        ws['B{:d}'.format(row)] = data[-2]
        ws['C{:d}'.format(row)] = data[-1]
        first = True
    else:
        if first:
            first = False
        else:
            row += 1
        ws['D{:d}'.format(row)] = data
wb.save(filename='katherine.xlsx')

cx_oracle reading from CSV

I have a cx_Oracle connection and I am looking to run a 'batch' of sorts, trying to gather IDs from last names in a CSV file. Below is my code, which raises a cx_Oracle.DatabaseError: ORA-01756: quoted string not properly terminated error.
It is pointing to the line
and spriden_change_ind is null'''.format(lname,fname)
However, I know this use of format works, as you will see that my commented code uses it in the same way and runs just fine. rows_to_dict_list is a nice function I found here some time ago that basically adds the column names to the output.
Any direction would be nice! Thank you.
import csv, cx_Oracle


def rows_to_dict_list(cursor):
    columns = [i[0] for i in cursor.description]
    new_list = []
    for row in cursor:
        row_dict = dict()
        for col in columns:
            row_dict[col] = row[columns.index(col)]
        new_list.append(row_dict)
    return new_list


connection = cx_Oracle.connect('USERNAME', 'PASSWORD', 'HOSTNAME:PORTNUMBER/SERVICEID')
cur = connection.cursor()
printHeader = True

with open('nopnumber_names.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lname = row['Last']
        fname = row['First']
        cur.execute('''select spriden_pidm as PIDM,
                       spriden_last_name as Last,
                       spriden_first_name as First,
                       spriden_mi as Middle,
                       spriden_ID as ID
                       from spriden
                       where upper(spriden_last_name) = '{0}'
                       and upper(spriden_first_name) = '{1}'
                       and spriden_change_ind is null'''.format(lname, fname)
                    )
        # THIS RECORD RUNS FINE
        # cur.execute('''select spriden_pidm as PIDM,
        #                spriden_ID as ID,
        #                spriden_last_name as Last,
        #                spriden_first_name as First
        #                from spriden
        #                where spriden_pidm = '{}'
        #                and spriden_change_ind is null'''.format(99999)
        #             )
        data = rows_to_dict_list(cur)
        for row in data:
            print row
cur.close()
connection.close()
My best guess is that a first name or surname somewhere in your CSV file has a ' character in it.
You really shouldn't be building SQL by concatenating strings or using string formatting. You are at risk of SQL injection if you do so. What happens if someone has put a record in your CSV file with surname X' OR 1=1 --?
Instead, use bind parameters to send the values of your variables to the database. Try the following:
cur.execute('''select spriden_pidm as PIDM,
               spriden_last_name as Last,
               spriden_first_name as First,
               spriden_mi as Middle,
               spriden_ID as ID
               from spriden
               where upper(spriden_last_name) = :lname
               and upper(spriden_first_name) = :fname
               and spriden_change_ind is null''',
            {"lname": lname, "fname": fname}
            )
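One more detail worth checking: the WHERE clause upper-cases the column values, but the bound values come straight from the CSV. If the file can contain mixed-case names, upper-casing them on the Python side keeps the comparison consistent. A sketch, where sql_query stands for the SELECT statement above:

# sql_query stands for the SELECT statement above; upper-case the bound
# values so they match the upper() calls in the WHERE clause.
cur.execute(sql_query, {"lname": lname.upper(), "fname": fname.upper()})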
