Read csv starting with leading spaces - python

I have a delimited file (from a third party) in which each line starts and ends with a space, the fields are separated by semicolons and quoted with double quotes, and the file ends with a line containing only a space.
"first_name";"last_name"
"John";"Doe"
"Anita";"Doe"
I try to read this with the following code.
import csv
import json

def read_csv(filename):
    result = []
    with open(filename, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f, delimiter=';', quotechar='"')
        for line_index, line in enumerate(csv_reader):
            if line_index == 0:
                header = line
                continue
            result.append(dict(zip(header, line)))
    return result

if __name__ == '__main__':
    contents = read_csv('test.txt')
    print(json.dumps(contents, indent=4, sort_keys=True))
This is my expected result:
[
    {
        "first_name": "John",
        "last_name ": "Doe "
    },
    {
        "first_name": "Anita",
        "last_name ": "Doe "
    }
]
However, it always treats the double quotes as part of the first column, due to the leading spaces, and it also picks up the last line. This is the result I get:
[
    {
        " \"first_name\"": " \"John\"",
        "last_name ": "Doe "
    },
    {
        " \"first_name\"": " \"Anita\"",
        "last_name ": "Doe "
    },
    {
        " \"first_name\"": " "
    }
]
How can I get rid of these leading and trailing spaces before the csv is parsed? The answer here shows how to remove spaces from fields after the file is read, but that wouldn't be good here, since it's not the contents of the fields that I want to change, but the fields themselves.
By the way: I am using Python 3.5.
EDIT
I am skipping empty lines now using the following code:
# Skip empty lines
line = [column.strip() for column in line]
if not any(line):
    continue

You can use skipinitialspace=True and a csv.DictReader (which assumes the first row is a header and builds the name -> value dicts for you, instead of your doing it manually), eg:
with open(filename) as fin:
    csvin = csv.DictReader(fin, delimiter=';', skipinitialspace=True)
    result = list(csvin)
Alternatively, if only rows with some value should be considered (ie, the last row with no values, or even interim blank rows, should be filtered out), you can use:
result = [row for row in csvin if any(row.values())]
Which'll give you:
[{'first_name': 'John', 'last_name ': 'Doe '},
 {'first_name': 'Anita', 'last_name ': 'Doe '}]
And the result of json.dumps(result, indent=4, sort_keys=True) is:
[
    {
        "first_name": "John",
        "last_name ": "Doe "
    },
    {
        "first_name": "Anita",
        "last_name ": "Doe "
    }
]
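If you also want the padding gone before the csv module ever sees it, a minimal sketch (my own addition, not part of the original answer) wraps the file in a generator that strips each raw line; csv.DictReader accepts any iterable of strings, and this also drops the trailing line that contains only a space:
import csv

def stripped_lines(f):
    # Strip the leading/trailing space from each raw line and skip
    # lines that become empty, such as the final " " line.
    for line in f:
        line = line.strip()
        if line:
            yield line

with open('test.txt', encoding='utf-8') as f:
    csvin = csv.DictReader(stripped_lines(f), delimiter=';', skipinitialspace=True)
    result = list(csvin)
Note this only trims whitespace outside the quotes; anything inside the quoted fields (such as the trailing space in "last_name ") is preserved.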

Related

How to fix the output for converting to JSON

I wrote code in Python that converts a file with these objects to JSON. It produces valid JSON, but the output is not exactly what I need.
{
    name: (sindey, crosby)
    game: "Hockey"
    type: athlete
},
{
    name: (wayne, gretzky)
    game: "Ice Hockey"
    type: athlete
}
Code:
import json

f = open("log.file", "r")
content = f.read()
splitcontent = content.splitlines()

d = []
for line in splitcontent:
    appendage = {}
    if ('}' in line) or ('{' in line):
        # Append a just-created record and start a new one
        continue
    d.append(appendage)
    key, val = line.split(':')
    if val.endswith(','):
        # strip a trailing comma
        val = val[:-1]
    appendage[key] = val

with open("json_log.json", 'w') as file:
    file.write(json.dumps(d, indent=4, sort_keys=False))
Desired output:
[
    {
        "name": "(sindey, crosby)",
        "game": "Hockey",
        "type": "athlete"
    },
    {
        "name": "(wayne, gretzky)",
        "game": "Ice Hockey",
        "type": "athlete"
    }
]
But I'm getting:
[
    {
        " name": " (sindey, crosby)"
    },
    {
        " game": " \"Hockey\""
    },
    {
        " type": " athlete"
    },
    {
        " name": " (wayne, gretzky)"
    },
    {
        " game": " \"Ice Hockey\""
    },
    {
        " type": " athlete"
    }
]
Any way to fix it so I get the desired output, without the {} around each individual line?
It's usually a good idea to split parsing into simpler tasks, e.g. first parse records, then parse fields.
I'm skipping the file handling and using a text variable:
intxt = """
{
name: (sindey, crosby)
game: "Hockey"
type: athlete
},
{
name: (wayne, gretzky)
game: "Ice Hockey"
type: athlete
}
"""
Then create a function that can yield all lines that are part of a record:
import json

def parse_records(txt):
    reclines = []
    for line in txt.split('\n'):
        if ':' not in line:
            if reclines:
                yield reclines
            reclines = []
        else:
            reclines.append(line)
and a function that takes those lines and parses each key/value pair:
def parse_fields(reclines):
    res = {}
    for line in reclines:
        key, val = line.strip().rstrip(',').split(':', 1)
        res[key.strip()] = val.strip()
    return res
the main function becomes trivial:
res = []
for rec in parse_records(intxt):
    res.append(parse_fields(rec))
print(json.dumps(res, indent=4))
the output, as desired:
[
    {
        "name": "(sindey, crosby)",
        "game": "\"Hockey\"",
        "type": "athlete"
    },
    {
        "name": "(wayne, gretzky)",
        "game": "\"Ice Hockey\"",
        "type": "athlete"
    }
]
The parsing functions can of course be made better, but you get the idea.
I hadn't checked the output properly; I have reworked the logic now, and the output is as expected.
import json

f = open("log.file", "r")
content = f.read()
print(content)
splitcontent = content.splitlines()

d = []
for line in splitcontent:
    if "{" in line:
        appendage = {}
    elif "}" in line:
        d.append(appendage)
    else:
        key, val = line.split(':')
        appendage[key.strip()] = val.strip()

with open("json_log.json", 'w') as file:
    file.write(json.dumps(d, indent=4, sort_keys=False))
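A further hedged option (mine, not from either answer): since the input is nearly JSON, you can rewrite it into valid JSON with a couple of regular expressions and let json.loads do the parsing. This sketch assumes keys are single words and values never contain a colon:
import json
import re

with open("log.file") as f:
    text = f.read()

def quote_pair(match):
    # Quote the bare key and, if necessary, the bare value:
    # `name: (sindey, crosby)` -> `"name": "(sindey, crosby)",`
    key, val = match.group(1), match.group(2).strip()
    if not (val.startswith('"') and val.endswith('"')):
        val = '"%s"' % val
    return '"%s": %s,' % (key, val)

text = re.sub(r'(\w+):\s*(.+)', quote_pair, text)
text = re.sub(r',(\s*})', r'\1', text)  # drop the comma each substitution left before a closing brace
records = json.loads('[%s]' % text)
print(json.dumps(records, indent=4))
Unlike the first answer, this keeps "Hockey" unescaped, since already-quoted values are passed through as-is.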

Convert excel to json but not having the expected format

I have this code in Python that turns an Excel file into JSON, but the output is not in the format I expect:
import pandas as pd
import json
import numpy as np

data = pd.read_excel('/home/bird/Downloads/file.xlsx', sheet_name='sheet1')
outpath = r"home/bird/Downloads/data.json"

plant_id = data.id
name = data.name
description = data.description
water = data.w
sun = data.s

container = {}
x = 0
while x < len(plant_id):
    container[plant_id[x]] = [
        {"plant_id: ": plant_id[x],
         "name: ": name[x],
         "description: ": description[x],
         "health:": {"water": water[x], "sun": sun[x]},
         },
    ]
    x = x + 1

df = pd.DataFrame(container)
df.to_json(outpath, indent=4)
I have the following output:
{
    "pl-01": {
        "0": {
            "plant_id: ": "pl-01",
            "name: ": "corn",
            "description: ": "yadayadayada",
            "health": {
                "water": "30%",
                "sun": "5%"
            }
        }
    },
    ...
}
What I want is slightly different:
{
    {
        "plant_id: ": "pl-01",
        "name: ": "corn",
        "description: ": "yadayadayada",
        "health": {
            "water": "30%",
            "sun": "5%",
        }
    },
    ...
}
I have colors in my Excel cells (green/yellow, ...) for the exposure column. How do I insert that into my JSON file so that I get a new field, e.g. "color": "green"?
import json

container = {}
for x in range(len(plant_id)):
    container[plant_id[x]] = {"plant_id: ": plant_id[x],
                              "name: ": name[x],
                              "description: ": description[x],
                              "health:": {"water": water[x], "sun": sun[x]},
                              "color": exposure[x]}

with open(outpath, 'w') as outfile:
    json.dump(container, outfile)
When creating each entry in the container you can add any field you like, here I added a field "color" with the value "green" or "red" depending on the value of exposure at index x.
Note that you have extra characters in the field names that are probably unnecessary: "plant_id: " should be "plant_id", "name: " should be "name" etc.
If you don't want a dictionary at the top level of your output (as per your example), dump the values as a list (a set would fail here: dicts aren't hashable, and sets aren't JSON-serializable):
with open(outpath, 'w') as outfile:
    json.dump(list(container.values()), outfile)
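Regarding the cell colors: pandas does not expose cell formatting, so the exposure list above has to be built some other way. A hedged sketch with openpyxl (the column index and ARGB codes are assumptions; adjust them to your sheet):
from openpyxl import load_workbook

wb = load_workbook('/home/bird/Downloads/file.xlsx')
ws = wb['sheet1']

# Map ARGB fill codes to readable names; extend with your actual colors.
COLOR_NAMES = {'FF00FF00': 'green', 'FFFFFF00': 'yellow'}

exposure = []
for row in ws.iter_rows(min_row=2):  # skip the header row
    cell = row[5]                    # assumed: exposure is the 6th column
    rgb = cell.fill.start_color.rgb
    exposure.append(COLOR_NAMES.get(rgb, rgb))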

Adding a comma between JSON objects in a datafile with Python?

I have a large file (about 3 GB) which contains what looks like a JSON file but isn't, because it lacks commas (,) between "observations", i.e. JSON objects (I have about 2 million of these "objects" in my data file).
For example, this is what I have:
{
    "_id": {
        "$id": "fh37fc3huc3"
    },
    "messageid": "4757724838492485088139042828",
    "attachments": [],
    "usernameid": "47284592942",
    "username": "Alex",
    "server": "475774810304151552",
    "text": "Must watch",
    "type": "462050823720009729",
    "datetime": "2018-08-05T21:20:20.486000+00:00",
    "type": {
        "$numberLong": "0"
    }
}
{
    "_id": {
        "$id": "23453532dwq"
    },
    "messageid": "232534",
    "attachments": [],
    "usernameid": "273342",
    "usernameid": "Alice",
    "server": "475774810304151552",
    "text": "https://www.youtube.com/",
    "type": "4620508237200097wd29",
    "datetime": "2018-08-05T21:20:11.803000+00:00",
    "type": {
        "$numberLong": "0"
    }
And this is what I want (the comma between "observations"):
{
    "_id": {
        "$id": "fh37fc3huc3"
    },
    "messageid": "4757724838492485088139042828",
    "attachments": [],
    "username": "Alex",
    "server": "475774810304151552",
    "type": {
        "$numberLong": "0"
    }
},
{
    "_id": {
        "$id": "23453532dwq"
    },
    "messageid": "232534",
    "attachments": [],
    "usernameid": "Alice",
    "server": "475774810304151552",
    "type": {
        "$numberLong": "0"
    }
This is what I tried but it doesn't give me a comma where I need it:
import re

with open('dataframe.txt', 'r') as input, open('out.txt', 'w') as output:
    output.write("[")
    for line in input:
        line = re.sub('', '},{', line)
        output.write(' ' + line)
    output.write("]")
What can I do so that I can add a comma between each JSON object in my datafile?
This solution presupposes that none of the string fields in the JSON contains { or }.
If we assume that there is at least one blank line between JSON dictionaries, here is an idea: maintain a count of unclosed curly brackets ({) as unclosed_count, and when we meet an empty line at zero depth, add the comma exactly once.
Like this:
with open('test.json', 'r') as input_f, open('out.json', 'w') as output_f:
    output_f.write("[")
    unclosed_count = 0
    comma_after_zero_added = True
    for line in input_f:
        unclosed_count_change = line.count('{') - line.count('}')
        unclosed_count += unclosed_count_change
        if unclosed_count_change != 0:
            comma_after_zero_added = False
        if line.strip() == '' and unclosed_count == 0 and not comma_after_zero_added:
            output_f.write(",\n")
            comma_after_zero_added = True
        else:
            output_f.write(line)
    output_f.write("]")
Assuming sufficient memory, you can parse such a stream one object at a time using json.JSONDecoder.raw_decode directly, instead of using json.loads.
>>> x = '{"a": 1}\n{"b": 2}\n' # Hypothetical output of open("dataframe.txt").read()
>>> decoder = json.JSONDecoder()
>>> x = '{"a": 1}\n{"b":2}\n'
>>> decoder.raw_decode(x)
({'a': 1}, 8)
>>> decoder.raw_decode(x, 9)
({'b': 2}, 16)
The output of raw_decode is a tuple containing the first JSON value decoded and the position in the string where the remaining data starts. (Note that json.loads just creates an instance of JSONDecoder, and calls the decode method, which just calls raw_decode and artificially raises an exception if the entire input isn't consumed by the first decoded value.)
A little extra work is involved; note that you can't start decoding on whitespace, so you'll have to skip any whitespace at the returned index to find where the next value starts.
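A hedged sketch of that loop, assuming the whole file fits in memory:
import json

def iter_json_values(text):
    decoder = json.JSONDecoder()
    pos = 0
    while pos < len(text):
        # raw_decode refuses to start on whitespace, so skip it first.
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos == len(text):
            break
        value, pos = decoder.raw_decode(text, pos)
        yield value

with open('dataframe.txt') as f:
    objects = list(iter_json_values(f.read()))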
Another way to view your data is that you have multiple json records separated by whitespace. You can use the stdlib JSONDecoder to read each record, then strip whitespace and repeat til done. The decoder reads a record from a string and tells you how far it got. Apply that iteratively to the data until all is consumed. This is far less risky than making a bunch of assumptions about what data is contained in the json itself.
import json

def json_record_reader(filename):
    with open(filename, encoding="utf-8") as f:
        txt = f.read().lstrip()
    decoder = json.JSONDecoder()
    result = []
    while txt:
        data, pos = decoder.raw_decode(txt)
        result.append(data)
        txt = txt[pos:].lstrip()
    return result

print(json_record_reader("data.json"))
Considering the size of your file, a memory mapped text file may be the better option.
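If even a single full read is too much, a chunked variant of the same raw_decode idea (my own sketch, simpler than an actual mmap solution) keeps only the unparsed tail in memory:
import json

def iter_json_records(filename, chunk_size=1 << 20):
    decoder = json.JSONDecoder()
    buf = ""
    with open(filename, encoding="utf-8") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            buf += chunk
            while True:
                buf = buf.lstrip()
                try:
                    # raw_decode raises ValueError while the record is still incomplete.
                    value, pos = decoder.raw_decode(buf)
                except ValueError:
                    break  # read another chunk and retry
                yield value
                buf = buf[pos:]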
If you're sure that the only place you will find a blank line is between two dicts, then you can go ahead with your current idea, after you fix its execution. For every line, check if it's empty. If it isn't, write it as-is. If it is, write a comma instead.
with open('dataframe.txt', 'r') as input_file, open('out.txt', 'w') as output_file:
    output_file.write("[")
    for line in input_file:
        if line.strip():
            output_file.write(line)
        else:
            output_file.write(",")
    output_file.write("]")
If you cannot guarantee that any blank line must be replaced by a comma, you need a different approach.
You want to replace a close-bracket, followed by an empty line (or multiple whitespace), followed by an open-bracket, with },{.
You can keep track of the previous two lines in addition to the current line, and if these are "}", "", and "{" in that order, then write a comma before writing the "{".
from collections import deque

with open('dataframe.txt', 'r') as input_file, open('out.txt', 'w') as output_file:
    last_two_lines = deque(maxlen=2)
    output_file.write("[")
    for line in input_file:
        line_s = line.strip()
        if line_s == "{" and list(last_two_lines) == ["}", ""]:
            output_file.write("," + line)
        else:
            output_file.write(line)
        last_two_lines.append(line_s)
Alternatively, if you want to stick with regex, then you could do
import re

with open('dataframe.txt') as input_file:
    file_contents = input_file.read()

repl_contents = re.sub(r'\}(\s+)\{', r'},\1{', file_contents)

with open('out.txt', 'w') as output_file:
    output_file.write(repl_contents)
Here, the regex r"\}(\s+)\{" matches the pattern we're looking for (\s+ matches a run of whitespace characters and captures it in group 1, which we then use in the replacement string as \1).
Note that you will need to read and run re.sub on the entire file, which will be slow.

converting text file to json in python

I have multiple documents that together are approximately 400 GB, and I want to convert them to JSON format in order to load them into Elasticsearch for analysis.
Each file is approximately 200 MB.
Original file looked like:
IUGJHHGF#BERLIN:lhfrjy
0t7yfudf#WARSAW:qweokm246
0t7yfudf#CRACOW:Er747474
0t7yfudf#cracow:kui666666
000t7yf#Vienna:1йй2ц2й2цй2цц3у
The file contains characters that are not only English. key1 is always separated by #, and the city is separated by either ; or :.
I parsed it with the following code:
#!/usr/bin/env python
# coding: utf8
import json

with open('2') as f:
    for line in f:
        s1 = line.find("#")
        rest = line[s1+1:]
        if rest.find(";") != -1:
            if rest.find(":") != -1:
                print "FOUND BOTH : ; "
                s2 = -0
            else:
                s2 = s1 + 1 + rest.find(";")
        elif rest.find(":") != -1:
            s2 = s1 + 1 + rest.find(":")
        else:
            print "FOUND NO : ; "
            s2 = -0
        key1 = line[:s1]
        city = line[s1+1:s2]
        description = line[s2+1:len(line)-1]
After that parsing, the whole file looks like:
RRS12345 Cracow Sunflowers
RRD12345 Berin Data
From that, I want to produce this output:
{
    "location_data": [
        {
            "key1": "RRS12345",
            "city": "Cracow",
            "description": "Sunflowers"
        },
        {
            "key1": "RRD123dsd45",
            "city": "Berlin",
            "description": "Data"
        },
        {
            "key1": "RRD123dsds45",
            "city": "Berlin",
            "description": "1йй2ц2й2цй2цц3у"
        }
    ]
}
How can I convert it to the required JSON format quickly, given that the data does not contain only English characters?
import json

def process_text_to_json():
    location_data = []
    with open("file.txt") as f:
        for line in f:
            line = line.split()
            location_data.append({"key1": line[0], "city": line[1], "description": line[2]})
    location_data = {"location_data": location_data}
    return json.dumps(location_data)
Output sample:
{"location_data": [{"city": "Cracow", "key1": "RRS12345", "description": "Sunflowers"}, {"city": "Berin", "key1": "RRD12345", "description": "Data"}, {"city": "Cracow2", "key1": "RRS12346", "description": "Sunflowers"}, {"city": "Berin2", "key1": "RRD12346", "description": "Data"}, {"city": "Cracow3", "key1": "RRS12346", "description": "Sunflowers"}, {"city": "Berin3", "key1": "RRD12346", "description": "Data"}]}
Iterate over each line and form your dict.
Ex:
d = {"location_data":[]}
with open(filename, "r") as infile:
for line in infile:
val = line.split()
d["location_data"].append({"key1": val[0], "city": val[1], "description": val[2]})
print(d)
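One point neither answer covers, given the question's emphasis on non-English characters: by default json.dumps escapes them as \uXXXX sequences. A hedged tweak is to pass ensure_ascii=False and write the file as UTF-8, e.g.:
import json

record = {"key1": "000t7yf", "city": "Vienna", "description": "1йй2ц2й2цй2цц3у"}

# ensure_ascii=False keeps the Cyrillic text readable in the output file.
with open("out.json", "w", encoding="utf-8") as f:
    json.dump({"location_data": [record]}, f, ensure_ascii=False, indent=4)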

How to read a large JSON file using Python ijson?

I am trying to parse a big json file (hundreds of gigs) to extract information from its keys. For simplicity, consider the following example:
import json
import random, string

# To create a random key
def random_string(length):
    return "".join(random.choice(string.lowercase) for i in range(length))

# Create the dictionary
dummy = {random_string(10): random.sample(range(1, 1000), 10) for times in range(15)}

# Dump the dictionary into a json file
with open("dummy.json", "w") as fp:
    json.dump(dummy, fp)
Then, I use ijson in python 2.7 to parse the file:
file_name = "dummy.json"
with open(file_name, "r") as fp:
for key in dummy.keys():
print "key: ", key
parser = ijson.items(fp, str(key) + ".item")
for number in parser:
print number,
I was expecting to retrieve all the numbers in the lists corresponding to the keys of the dict. However, I got:
IncompleteJSONError: Incomplete JSON data
I am aware of this post: Using python ijson to read a large json file with multiple json objects, but in my case I have a single json file that is well formed, with a relatively simple schema. Any ideas on how I can parse it? Thank you.
ijson has an iterator interface for dealing with large JSON files, allowing the file to be read lazily. You can process the file in small chunks and save the results somewhere else.
Calling ijson.parse() yields three values: prefix, event, value.
Some JSON:
{
    "europe": [
        {"name": "Paris", "type": "city"},
        {"name": "Rhein", "type": "river"}
    ]
}
Code:
import ijson

data = ijson.parse(open(FILE_PATH, 'r'))
for prefix, event, value in data:
    if event == 'string':
        print(value)
Output:
Paris
city
Rhein
river
Reference: https://pypi.python.org/pypi/ijson
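For the dummy.json in the question (a single top-level object mapping keys to lists), a single pass with ijson.kvitems may be closer to what was asked; this is a hedged sketch and assumes an ijson release recent enough to provide kvitems:
import ijson

with open("dummy.json", "rb") as fp:
    # '' is the prefix of the top-level object; kvitems yields each
    # (key, value) pair under it, one at a time.
    for key, numbers in ijson.kvitems(fp, ''):
        print(key, numbers)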
The sample JSON content file is given below; it has records for two people, but it might as well have 2 million records.
[
    {
        "Name" : "Joy",
        "Address" : "123 Main St",
        "Schools" : [
            "University of Chicago",
            "Purdue University"
        ],
        "Hobbies" : [
            {
                "Instrument" : "Guitar",
                "Level" : "Expert"
            },
            {
                "percussion" : "Drum",
                "Level" : "Professional"
            }
        ],
        "Status" : "Student",
        "id" : 111,
        "AltID" : "J111"
    },
    {
        "Name" : "Mary",
        "Address" : "452 Jubal St",
        "Schools" : [
            "University of Pensylvania",
            "Washington University"
        ],
        "Hobbies" : [
            {
                "Instrument" : "Violin",
                "Level" : "Expert"
            },
            {
                "percussion" : "Piano",
                "Level" : "Professional"
            }
        ],
        "Status" : "Employed",
        "id" : 112,
        "AltID" : "M112"
    }
]
I created an approach which returns each person's record as a JSON object. The code below is not the generator version itself; changing a couple of lines would make it a generator (see the sketch after it).
import json

curly_idx = []
jstr = ""
first_curly_found = False

with open("C:\\Users\\Rajeshs\\PycharmProjects\\Project1\\data\\test.json", 'r') as fp:
    # Reading file line by line
    line = fp.readline()
    lnum = 0
    while line:
        for a in line:
            if a == '{':
                curly_idx.append(lnum)
                first_curly_found = True
            elif a == '}':
                curly_idx.pop()

        # when the right curly for every left curly is found,
        # it would mean that one complete data element was read
        if len(curly_idx) == 0 and first_curly_found:
            jstr = f'{jstr}{line}'
            jstr = jstr.rstrip()
            jstr = jstr.rstrip(',')
            print("------------")
            if len(jstr) > 10:
                print("making json")
                j = json.loads(jstr)
                print(jstr)
            jstr = ""
            line = fp.readline()
            lnum += 1
            continue

        if first_curly_found:
            jstr = f'{jstr}{line}'

        line = fp.readline()
        lnum += 1
        if lnum > 100:
            break
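A hedged sketch of what the generator version could look like (same brace-counting idea, condensed):
import json

def iter_records(path):
    # Accumulate lines until every '{' is matched by a '}', then
    # yield the completed record as a parsed object.
    depth = 0
    buf = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            opens, closes = line.count('{'), line.count('}')
            if depth == 0 and opens == 0:
                continue  # skip the surrounding '[' / ']' lines
            depth += opens - closes
            buf.append(line)
            if depth == 0:
                text = "".join(buf).rstrip().rstrip(',')
                buf = []
                yield json.loads(text)

for person in iter_records("test.json"):
    print(person["Name"])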
You are starting more than one parsing iteration with the same file object without resetting it. The first call to ijson will work, but will move the file object to the end of the file; the second time you pass the same object to ijson, it will complain because there is nothing left to read.
Try opening the file each time you call ijson; alternatively, seek to the beginning of the file after calling ijson so the file object can read the data again.
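A minimal sketch of the seek approach, reusing the dummy dict from the question:
import ijson

with open("dummy.json", "rb") as fp:
    for key in dummy.keys():
        fp.seek(0)  # rewind so ijson can read the whole file again
        for number in ijson.items(fp, str(key) + ".item"):
            print(number)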
If you are working with JSON in the following format, you can use ijson.items().
sample json:
[
    {"id":2,"cost":0,"test":0,"testid2":255909890011279,"test_id_3":0,"meeting":"daily","video":"paused"},
    {"id":2,"cost":0,"test":0,"testid2":255909890011279,"test_id_3":0,"meeting":"daily","video":"paused"}
]
import gzip
from pathlib import Path

import ijson

input = 'file.txt'
res = []

if Path(input).suffix[1:].lower() == 'gz':
    input_file_handle = gzip.open(input, mode='rb')
else:
    input_file_handle = open(input, 'rb')

for json_row in ijson.items(input_file_handle, 'item'):
    res.append(json_row)
