JSON to CSV converter-YELP Data Set-python - python

I am interested in data mining and I would like to open and work with yelp's data. Yelp's data is in json format and in it's website it has the following code to convert json to csv. However when I open command line and write the following:
$ python json_to_csv_converter.py yelp_academic_dataset.json
I get an error. Can you help me please?
The code is:
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.
For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import csv
import simplejson as json
def read_and_write_file(json_file_path, csv_file_path, column_names):
"""Read in the json dataset file and write it out to a csv file, given the column names."""
with open(csv_file_path, 'wb+') as fout:
csv_file = csv.writer(fout)
csv_file.writerow(list(column_names))
with open(json_file_path) as fin:
for line in fin:
line_contents = json.loads(line)
csv_file.writerow(get_row(line_contents, column_names))
def get_superset_of_column_names_from_file(json_file_path):
"""Read in the json dataset file and return the superset of column names."""
column_names = set()
with open(json_file_path) as fin:
for line in fin:
line_contents = json.loads(line)
column_names.update(
set(get_column_names(line_contents).keys())
)
return column_names
def get_column_names(line_contents, parent_key=''):
"""Return a list of flattened key names given a dict.
Example:
line_contents = {
'a': {
'b': 2,
'c': 3,
},
}
will return: ['a.b', 'a.c']
These will be the column names for the eventual csv file.
"""
column_names = []
for k, v in line_contents.iteritems():
column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
if isinstance(v, collections.MutableMapping):
column_names.extend(
get_column_names(v, column_name).items()
)
else:
column_names.append((column_name, v))
return dict(column_names)
def get_nested_value(d, key):
"""Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.
Example:
d = {
'a': {
'b': 2,
'c': 3,
},
}
key = 'a.b'
will return: 2
"""
if '.' not in key:
if key not in d:
return None
return d[key]
base_key, sub_key = key.split('.', 1)
if base_key not in d:
return None
sub_dict = d[base_key]
return get_nested_value(sub_dict, sub_key)
def get_row(line_contents, column_names):
"""Return a csv compatible row given column names and a dict."""
row = []
for column_name in column_names:
line_value = get_nested_value(
line_contents,
column_name,
)
if isinstance(line_value, unicode):
row.append('{0}'.format(line_value.encode('utf-8')))
elif line_value is not None:
row.append('{0}'.format(line_value))
else:
row.append('')
return row
if __name__ == '__main__':
"""Convert a yelp dataset file from json to csv."""
parser = argparse.ArgumentParser(
description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
)
parser.add_argument(
'json_file',
type=str,
help='The json file to convert.',
)
args = parser.parse_args()
json_file = args.json_file
csv_file = '{0}.csv'.format(json_file.split('.json')[0])
column_names = get_superset_of_column_names_from_file(json_file)
read_and_write_file(json_file, csv_file, column_names)
Error I am getting in the command line:
Traceback (most recent call last):
File "json_to_csv_converter.py", line 122, in column_names=get_superset_of_column_names_from_file
File "json_to_csv_converter.py", line 25, in get_superset_of_column_names_from_file
for line in fin:
File "C:\Users\Bengi\Appdata\Local\Programs\Python\Python35-32\lib\encodings\cp1252.py" line 23, in decode
return codecs.charmap_decode(input, self_errors,decoding_table)[0]
Unicode Decode Error: 'charmap' codec cant decode byte 0X9d in position 1102: character maps to

Do you have file encoding problem. You should put encoding='utf8' after json file open functions like: with open(json_file_path, encoding='utf8') as fin:

Judging by the error message there appears to be something wrong with your input file. It looks like json_to_csv_converter.py has determined that the file encoding is Windows 1252, but there is one or more invalid characters in the file, namely '\x9d', which is not a valid 1252 code point.
Check that your file is properly encoded. I'd guess that the file is UTF8 encoded, but for some reason it is being processed as if it was Windows 1252. Did you edit the file?

Winzip seems to mangle it somehow. I worked around this by:
Using 7-Zip to extract the tar file.
Editing the script to force use of UTF-8 encoding, like so:
with open(json_file_path, encoding='utf8') as fin:

Related

Convert complex nested JSON to csv using python

I got a complex nested JSON file and I need to convert it completely, but my code can only convert part of the information,
How can I modify my code, or do you guys have a better code?
my json file
import csv
import json
import sys
import codecs
def trans(path):
jsonData = codecs.open('H://14.json', 'r', 'utf-8')
# csvfile = open(path+'.csv', 'w')
# csvfile = open(path+'.csv', 'wb')
csvfile = open('H://11.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csvfile, delimiter=',')
flag = True
for line in jsonData:
dic = json.loads(line)
if flag:
keys = list(dic.keys())
print(keys)
writer.writerow(keys)
flag = False
writer.writerow(list(dic.values()))
jsonData.close()
csvfile.close()
if __name__ == '__main__':
path=str(sys.argv[0])
print(path)
trans(path)
my json file
{"id":"aa","sex":"male","name":[{"Fn":"jeri","Ln":"teri"}],"age":45,"info":[{"address":{"State":"NY","City":"new york"},"start_date":"2001-09","title":{"name":"Doctor","Exp":"head"},"year":"2001","month":"05"}],"other":null,"Hobby":[{"smoking":null,"gamble":null}],"connect":[{"phone":"123456789","email":"info#gmail.com"}],"Education":"MBA","School":{"State":"NY","City":"new york"}}
{"id":"aa","sex":"female","name":[{"Fn":"lo","Ln":"li"}],"age":34,"info":[{"address":{"State":"NY","City":"new york"},"start_date":"2008-11","title":{"name":"Doctor","Exp":"hand"},"year":"2008","month":"02"}],"other":null,"Hobby":[{"smoking":null,"gamble":null}],"connect":[{"phone":"123456789","email":"info#gmail.com"}],"Education":"MBA","School":{"State":"NY","City":"new york"}}
It only converts part of the information, 'name''info''Hobby''connect''School' these information are not converted,I need to convert all information completely,
You could use the below function to treat each dic. It will flatten the dict by recursive calls until there is no dict or list in the values. In order to avoid issues with 2 keys having the same name, I concatenate with the previous level.
WARNING: it is based on your format so if you have lists with more than one element in the middle, it will only take care of the first element.
def flatten_dict(input_dict, result = None):
result = result or {}
for key, value in input_dict.items():
if isinstance(value, list):
current_dict = {key+"_"+k: v for k, v in value[0].items()}
flatten_dict(current_dict, result)
elif isinstance(value, dict):
current_dict = {key+"_"+k: v for k, v in value.items()}
flatten_dict(current_dict, result)
else:
result[key] = value
return result
Then apply this function on each dic, transform to Dataframe and save as CSV.
res = []
for line in jsonData:
dic = json.loads(line)
res.append(flatten_dict(dic))
res_df = pd.DataFrame(res)
res_df.to_csv("result.csv")
Result:

Replace values from YAML in JSON file - Python 3

I want to replace the values from YAML file into a JSON file using Python 3.
I have many JSON files where I want to get the values from a master YAML file and replace only certain values like source server ip, email, hostname.
E.g. I have this YAML file (mast_conf.yaml):
- sourcesystem1:
sourceServer: 1.2.3.500
MailTo: gokul#gmail.com
- sourcesystem2:
sourceServer1: 2.2.3.500
sourceServer2: 3.2.3.500
MailTo: gokul#gmail.com
A JSON file (sourcesystem1.json):
{
"source":"sourcesystem1",
"frequency":"daily",
"sourceServer":"1.2.1.2",
"hostName":"1.2.1.3",
"fileFormat":"csv",
"delimiterType":"semicolon"
}
Another JSON file (sourcesystem2.json):
{
"source":"sourcesystem2",
"frequency":"daily",
"sourceServer":"1.2.3.2",
"hostName":"1.2.1.7",
"fileFormat":"csv",
"delimiterType":"commaseperated"
}
Below is my code I am trying out to parse the value from the json file
import json
import yaml
with open("master_conf.yaml", 'r') as f:
yaml_config = yaml.safe_load(f)
yaml_config = {
list(config.keys()[0]): list(config[config.keys()[0]])
for config in yaml_config
}
json_files = ( "sourcesystem1.json",
"sourcesystem2.json",
)
for json_file in json_files:
with open(json_file, "r") as f:
sourcesystem_conf = json.load(f)
sourcesystem = sourcesystem_conf["source"]
if sourcesystem in yaml_config:
for key, value in yaml_config[sourcesystem].items():
sourcesystem_conf[key] = value
with open(json_file, "w") as f:
json.dump(sourcesystem_conf, f, indent=2)
I am getting the below error by program
TypeError: 'dict_keys' object does not support indexing
When I run indivudually I get this issue for yaml
>>> yaml_config = { ... config.keys()[0]: config[config.keys()[0]] ... for config in yaml_config ... } Traceback (most recent call last): File "<stdin>", line 3, in <module> File "<stdin>", line 3, in <dictcomp> TypeError: 'dict_keys' object is not subscriptable >>>
Is there easier method to achieve my end goal where I want to replace the values in the JSON file from the Yaml configuration file
This is needed to update 1000s of Json file in a automated way for updating it from a master Yaml file
The easiest way is to use pyyaml, see Jon's answer.
Then you can load you yaml file using it :
>>> import yaml
>>> yaml_config = yaml.safe_load(yaml_file)
>>> yaml_config
[{'sourcesystem1': {'MailTo': 'gokul#gmail.com', 'sourceServer': '1.2.3.500'}},
{'sourcesystem2': {'MailTo': 'gokul#gmail.com',
'sourceServer1': '2.2.3.500',
'sourceServer2': '3.2.3.500'}}]
It will be easier to manipulate a dict with source systems as keys.
In python 2 aDict.keys() returns a list so the following will work :
>>> yaml_config = {
config.keys()[0]: config[config.keys()[0]]
for config in yaml_config
}
>>> yaml_config
{'sourcesystem1': {'MailTo': 'gokul#gmail.com', 'sourceServer': '1.2.3.500'},
'sourcesystem2': {'MailTo': 'gokul#gmail.com',
'sourceServer1': '2.2.3.500',
'sourceServer2': '3.2.3.500'}}
In python 3 aDict.keys() no longer returns a list so you can simply use a for loop :
yaml_config = {}
for config in yaml_config_raw:
source = [key for key in config][0]
yaml_config[source] = config[source]
Then you can just iterate over your json files to update them :
import json
import yaml
with open("mast_conf.yaml", 'r') as f:
yaml_config_raw = yaml.safe_load(f)
yaml_config = {}
for config in yaml_config_raw:
source = [key for key in config][0]
yaml_config[source] = config[source]
json_files = (
"sourcesystem1.json",
"sourcesystem2.json",
)
for json_file in json_files:
with open(json_file, "r") as f:
sourcesystem_conf = json.load(f)
sourcesystem = sourcesystem_conf["source"]
if sourcesystem in yaml_config:
for key, value in yaml_config[sourcesystem].items():
sourcesystem_conf[key] = value
with open(json_file, "w") as f:
json.dump(sourcesystem_conf, f, indent=2)

How to open a csv file for reading purpose using mmap in python?

I want to open csv file for reading purpose. But I'm facing some exceptions regarding to that.
I'm using Python 2.7.
main.python-
if __name__ == "__main__":
f = open('input.csv','r+b')
m = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
reader = csv.DictReader(iter(m.readline, ""))
for read in reader:
num = read['time']
print num
output-
Traceback (most recent call last):
File "/home/PycharmProjects/time_gap_Task/main.py", line 22, in <module>
for read in reader:
File "/usr/lib/python3.4/csv.py", line 109, in __next__
self.fieldnames
File "/usr/lib/python3.4/csv.py", line 96, in fieldnames
self._fieldnames = next(self.reader)
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
How to resolve this error? and how to open csv file using mmap and csv in good manner so code is working perfect?
I know you asked this a while ago, but I actually created a module for myself that does this, because I do a lot of work with large CSV files, and sometimes I need to convert them into dictionaries, based on a key. Below is the code I've been using. Please feel free to modify as needed.
def MmapCsvFileIntoDict(csvFilePath, skipHeader = True, transform = lambda row: row, keySelector = lambda o: o):
"""
Takes a CSV file path and uses mmap to open the file and return a dictionary of the contents keyed
on the results of the keySelector. The default key is the transformed object itself. Mmap is used because it is
a more efficient way to process large files.
The transform method is used to convert the line (converted into a list) into something else. Hence 'transform'.
If you don't pass it in, the transform returns the list itself.
"""
contents = {}
firstline = False
try:
with open(csvFilePath, "r+b") as f:
# memory-map the file, size 0 means whole file
mm = mmap.mmap(f.fileno(), 0)
for line in iter(mm.readline, b''):
if firstline == False:
firstline = True
if skipHeader == True:
continue
row = ''
line = line.decode('utf-8')
line = line.strip()
row = next(csv.reader([line]), '')
if transform != None and callable(transform):
if row == None or row == '':
continue
value = transform(row)
else:
value = row
if callable(keySelector):
key = keySelector(value)
else:
key = keySelector
contents[key] = value
except IOError as ie:
PrintWithTs('Error decomposing the companies: {0}'.format(ie))
return {}
except:
raise
return contents
When you call this method, you have some options.
Assume you have a file that looks like:
Id, Name, PhoneNumber
1, Joe, 7175551212
2, Mary, 4125551212
3, Vince, 2155551212
4, Jane, 8145551212
The easiest way to call it is like this:
dict = MmapCsvFileIntoDict('/path/to/file.csv', keySelector = lambda row: row[0])
What you get back is a dict looking like this:
{ '1' : ['1', 'Joe', '7175551212'], '2' : ['2', 'Mary', '4125551212'] ...
One thing I like to do is create a class or a namedtuple to represent my data:
class CsvData:
def __init__(self, row):
self.Id = int(row[0])
self.Name = row[1].upper()
self.Phone = int(row[2])
And then when I call the method, I pass in a second lambda to transform each row in the file to an object I can work with:
dict = MmapCsvFileIntoDict('/path/to/file.csv', transform = lambda row: CsvData(row), keySelector = lambda o: o.Id)
What I get back that time looks like:
{ 1 : <object instance>, 2 : <object instance>...
I hope this helps! Best of luck
When open a file with the flag b like this:
f = open('input.csv','r+b')
You read the file as bytes and not as string.
So, try to change the flags to r:
f = open('input.csv','r')
if you just want to read data with specific columnes from csv file, just try:
import csv
with open('input.csv') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
print row['time']

convert the following json to csv using python

{"a":"1","b":"1","c":"1"}
{"a":"2","b":"2","c":"2"}
{"a":"3","b":"3","c":"3"}
{"a":"4","b":"4","c":"4"}
I have tried the following code but it gives error:-
from nltk.twitter import Twitter
from nltk.twitter.util import json2csv
with open('C:/Users/Archit/Desktop/raw_tweets.json', 'r') as infile:
# Variable for building our JSON block
json_block = []
for line in infile:
# Add the line to our JSON block
json_block.append(line)
# Check whether we closed our JSON block
if line.startswith('{'):
# Do something with the JSON dictionary
json2csv(json_block, 'tweets.csv', ['id','text','created_at','in_reply_to_user_id','in_reply_to_screen_name','in_reply_to_status_id','user.id','user.screen_name','user.name','user.location','user.friends_count','user.followers_count','source'])
# Start a new block
json_block = []
Error:
File "C:\Python34\lib\json\decoder.py", line 361, in raw_decode
raise ValueError(errmsg("Expecting value", s, err.value)) from None
ValueError: Expecting value: line 1 column 1 (char 0)
import csv, json
data = []
with open('C:\Users\Shahriar\Desktop\T.txt') as data_file:
for line in data_file:
data.append(json.loads(line))
keys = data[0].keys()
with open('data.csv', 'wb') as csvF:
csvWriter = csv.DictWriter(csvF, fieldnames=keys)
csvWriter.writeheader()
for d in data:
csvWriter.writerow(d)
Output:
a,c,b
1,1,1
2,2,2
3,3,3
4,4,4
This is way too late but I also stumbled upon some errors today. I figured that you actually have to import from nltk.twitter.common instead of util. Hope this helps others who stumbled upon this thread
# Read json
filename = 'C:/Users/Archit/Desktop/raw_tweets.json'
lines = [line.replace("{", "").replace("}", "").replace(":", ",") for line in open(filename)]
# Write csv
with open('out.csv', 'w') as csv_file:
for line in lines:
csv_file.write("%s\n" % line)

Python CSV to JSON creating an error with symbols for example £

I'm having a problem with reading/using csv files that have symbols in them in this case it's the symbol £
Running the following code:
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("A", "B", "C")
reader = csv.DictReader(csvfile, fieldnames)
for row in reader:
print row
json.dump(row, jsonfile)
jsonfile.write('\n')
On the data
A,B,C,D,E,F,
10:00,LOC,1-10,£500.00,4+,DATA,
10:00,LOC,1-10,£500.00,2+,MORE,
10:10,LOC2,1-1,£9000,39+,DATA,
10:10,LOC2,1-3,£500,22+,MORE,
10:10,LOC2,1-5,£500,11+,DATA,
Prints:
{'A': 'A', None: ['D', 'E', 'F', ''], 'C': 'C', 'B': 'B'}
{'A': '10:00', None: ['\xa3500.00', '4+', 'DATA', ''], 'C': '1-10', 'B': 'LOC'}
Traceback (most recent call last):
File "C:\testscript.py", line 12, in <module>
json.dump(row, jsonfile)
File "C:\Python27\lib\json\__init__.py", line 189, in dump
for chunk in iterable:
File "C:\Python27\lib\json\encoder.py", line 434, in _iterencode
for chunk in _iterencode_dict(o, _current_indent_level):
File "C:\Python27\lib\json\encoder.py", line 408, in _iterencode_dict
for chunk in chunks:
File "C:\Python27\lib\json\encoder.py", line 313, in _iterencode_list
yield buf + _encoder(value)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa3 in position 0: invalid start byte
I do eventually want to parse columns that have symbols like the £ pound sign through the json module outputting them into the json file.
How can I get around this problem so that that the error doesn't come up...
Using chfw's code and getting around \u00a
# -*- coding: Latin-1 -*-
import csv
import json
import codecs
import StringIO
fieldnames = ("A", "B", "C")
### code insertion start ###
class Iterator(object):
def next(self):
return type(self).__next__(self)
class UTF8Recorder(Iterator):
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8.
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def __next__(self):
return next(self.reader).encode('utf-8')
### code insertion stop ###
with open('DATA.csv', 'rb') as csvfile:
reader = csv.DictReader(UTF8Recorder(csvfile, 'iso-8859-1'), fieldnames)
stored_output = StringIO.StringIO()
for row in reader:
print row
json.dump(row, stored_output)
stored_output.write('\n')
contents = stored_output.getvalue().strip()
with open('DATA.json', 'w') as jsonfile:
jsonfile.write(contents.replace("\u00a", "£"))
Here is the information on the encoding of pound sign. I guess you had "iso-8859-1". Hence my solution is:
import csv
import json
import codecs
csvfile = open('utf.csv', 'rb')
jsonfile = open('file.json', 'w')
fieldnames = ("A", "B", "C")
### code insertion start ###
class Iterator(object):
def next(self):
return type(self).__next__(self)
class UTF8Recorder(Iterator):
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8.
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def __next__(self):
return next(self.reader).encode('utf-8')
### code insertion stop ###
reader = csv.DictReader(UTF8Recorder(csvfile, 'iso-8859-1'), fieldnames)
for row in reader:
print row
jsonfile.write(row.__repr__()) # json.dump(row, jsonfile), <- updated
jsonfile.write('\n')
If the encoding is not 'iso-8859-1', please replace the one that you know in the UTF8Recorder argument.

Categories

Resources