Convert a text file of dictionaries to CSV in Python

I have a text file composed by different dictionaries and it looks like this:
{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas#foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"}
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7#foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}
Each line is a dictionary and some dictionaries have more keys than others, and I would like to convert the text file to a csv file.
I have the following code:
import json
import csv
import ast
def json_to_csv(txt_file, csv_file):
lista = []
with open(txt_file, 'rb') as fin:
lines = fin.readlines()
for line in lines:
dict_line = ast.literal_eval(line)
lista.append(line)
list_json = json.dumps(lista)
read_json = json.loads(list_json)
header =["feed.accuracy","feed.url","source.geolocation.longitude","event_description.text","raw","destination.geolocation.city","source.ip","classification.taxonomy",
"time.observation","destination.geolocation.latitude","destination.ip","source.asn","feed.name","source.geolocation.latitude","time.source","feed.provider",
"destination.geolocation.longitude","destination.geolocation.cc","destination.asn","source.abuse_contact","source.geolocation.cc","classification.type"]
with open(csv_file, 'wb+') as f:
dict_writer = csv.DictWriter(f, header)
dict_writer.writeheader()
dict_writer.writerows(read_json)
First I read the text file, then I convert its content into JSON and then I try to write the converted data into the csv file, however its returning the following error:
Traceback (most recent call last):
File "<pyshell#38>", line 1, in <module>
json_to_csv('ctt.txt','ctt.csv')
File "C:/Users/Marisa/Documents/json_to_csv.py", line 26, in json_to_csv
dict_writer.writerows(read_json)
File "C:\Python27\lib\csv.py", line 157, in writerows
rows.append(self._dict_to_list(rowdict))
File "C:\Python27\lib\csv.py", line 148, in _dict_to_list
+ ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: u'{', u'"', u'f', u'e', u'e', u'd', u'.', u'a', u'c', u'c', u'u', u'r', u'a', u'c', u'y', u'"', u':', u' ', u'1', u'0', u'0', u'.', u'0', u',', u' ', u'"', u'c', u'l', u'a', u's', u's', u'i', u'f', u'i', u'c', u'a', u't', u'i', u'o', u'n', u'.', u't', u'a', u'x',...

You're making it a little more complicated than it needs to be, and you're missing some of the fields in your own example data above. We can get rid of the ast dependency and the back & forth JSON processing, add in the missing fields, and the following will work with the sample data you've provided:
import json
import csv
def json_to_csv(txt_file, csv_file):
    """Convert a file of newline-delimited JSON objects into a CSV file.

    Each non-blank line of *txt_file* must be one JSON object; objects may
    have differing key sets.  Missing values become empty cells, and any
    field not in the anticipated header is appended to it automatically, so
    DictWriter never raises "dict contains fields not in fieldnames".

    :param txt_file: path to the input file (one JSON dict per line)
    :param csv_file: path to the CSV file to create/overwrite
    """
    rows = []
    with open(txt_file, 'r') as in_file:
        for line in in_file:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of crashing.
                continue
            try:
                rows.append(json.loads(line))
            except ValueError as err:
                # json.JSONDecodeError subclasses ValueError; best-effort:
                # report the bad line and keep going, as the original did.
                print(err)
    header = [
        "feed.accuracy", "feed.url", "source.geolocation.longitude",
        "event_description.text", "raw", "destination.geolocation.city",
        "source.ip", "classification.taxonomy", "time.observation",
        "destination.geolocation.latitude", "destination.ip", "source.asn",
        "feed.name", "source.geolocation.latitude", "time.source",
        "feed.provider", "destination.geolocation.longitude",
        "destination.geolocation.cc", "destination.asn",
        "source.abuse_contact", "source.geolocation.cc", "classification.type",
        'destination.fqdn', 'source.fqdn', 'source.geolocation.city',
        'source.url', 'destination.url'
    ]
    # Pick up any fields the sample data didn't anticipate.
    known = set(header)
    for row in rows:
        for key in row:
            if key not in known:
                known.add(key)
                header.append(key)
    # newline='' stops the csv module emitting extra blank rows on Windows.
    with open(csv_file, 'w+', newline='') as out_file:
        dict_writer = csv.DictWriter(out_file, header, restval='')
        dict_writer.writeheader()
        dict_writer.writerows(rows)
Note that if your real data has more fields that aren't included in your sample, you'll need to add those, too.
Note too that if your input data were a proper JSON array like:
[{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas#foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"},
{"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7#foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}]
the solution simplifies quite a bit more with the whole initial with open block becoming just:
with open(txt_file, 'r') as in_file:
lista = json.load(in_file)

Related

eval function doesn't turn a dict-like string into a dict?

So I have several strings in a DataFrame column looking like this one for example:
{'Free to Play': 17555, 'Multiplayer': 10499, 'FPS': 9248, 'Action': 8188, 'Shooter': 7857, 'Class-Based': 6098, 'Team-Based': 5363, 'Funny': 5155, 'First-Person': 4846, 'Trading': 4512, 'Cartoony': 4240, 'Competitive': 4116, 'Online Co-Op': 4016, 'Co-op': 3920, 'Robots': 3112, 'Comedy': 3049, 'Tactical': 2726, 'Crafting': 2491, 'Cartoon': 2450, 'Moddable': 2315}
I am trying to access the keys of the dict but as it is still a string I wanted to convert it into dictionaries and found people saying that eval can be used for that. And yes when I try like this it works fine and test_dict is of type dict:
test_str = "{'Early Access': 77, 'RPG': 202}"
test_dict = eval(test_str)
Yet when working with the strings in the DataFrame
tags = main_data["tags"]
for taglist in tags:
taglist = "\"" + taglist + "\""
tag_dict = eval(taglist)
tag_dict always remains a string and after some strings eval throws errors like these:
File "<string>", line 1
"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1659, 'Shooter': 1420, 'Classic': 1344, 'Team-Based': 943, 'First-Person': 799, 'Competitive': 790, 'Tactical': 734, "1990's": 564, 'e-sports': 550, 'PvP': 480, 'Military': 367, 'Strategy': 329, 'Score Attack': 200, 'Survival': 192, 'Old School': 164, 'Assassin': 151, '1980s': 144, 'Violent': 40}"
^
SyntaxError: invalid syntax
I found out it might be a problem with the length of the strings, as when using taglist = "\"\"\"" + taglist + "\"\"\"" eval doesn't throw any errors and goes through all the strings, but still they are not converted to a dict and remain str.
Maybe I have done some rookie mistake or there are better approaches to solving my problem?
Since you're serializing your dict to some kind of external storage, I would use json. It's designed for this, whereas eval is ... tricky. And you're actually running code, so whatever someone puts in the database, you're going to run it.
There's one catch: JSON expects double quotes. Since it's already written to the database as Python code with single quotes around the dictionary keys, you're going to have to convert those to double quotes to be legal JSON. I'd suggest fixing it once in the database, and then use JSON going forward.
import json
# Round-trip demo: serialise a dict with json, then read it back unchanged.
data_dict = {'Free to Play': 17555, 'Multiplayer': 10499, 'FPS': 9248, 'Action': 8188, 'Shooter': 7857, 'Class-Based': 6098, 'Team-Based': 5363, 'Funny': 5155, 'First-Person': 4846, 'Trading': 4512, 'Cartoony': 4240, 'Competitive': 4116, 'Online Co-Op': 4016, 'Co-op': 3920, 'Robots': 3112, 'Comedy': 3049, 'Tactical': 2726, 'Crafting': 2491, 'Cartoon': 2450, 'Moddable': 2315}
data_dict['Early Access'] = 77
data_dict['RPG'] = 202
serialized = json.dumps(data_dict)
# pretend `serialized` went to a file or database and came back unchanged
data_dict = json.loads(serialized)
print(data_dict['RPG'])

# A legacy value already stored with Python-style single quotes.
database_string = "{'Free to Play': 17555, 'Multiplayer': 10499, 'FPS': 9248, 'Action': 8188, 'Shooter': 7857, 'Class-Based': 6098, 'Team-Based': 5363, 'Funny': 5155, 'First-Person': 4846, 'Trading': 4512, 'Cartoony': 4240, 'Competitive': 4116, 'Online Co-Op': 4016, 'Co-op': 3920, 'Robots': 3112, 'Comedy': 3049, 'Tactical': 2726, 'Crafting': 2491, 'Cartoon': 2450, 'Moddable': 2315}"
# Swap single quotes for double quotes so the text is legal JSON.
# Works for this particular data; it is not a general-purpose converter.
data_dict = json.loads(database_string.replace("'", '"'))
print(data_dict['Multiplayer'])
I can probably correct your eval if you want, but can't do it right this second. But like I said, not recommended. And I'd use ast.literal_eval rather than actually executing it with eval, for security reasons.

saving json adds backslashes [duplicate]

This question already has answers here:
Dump to JSON adds additional double quotes and escaping of quotes
(5 answers)
Closed 1 year ago.
jsonContent = json.dumps(myDict, default=convert)
with open('data.json', 'w', encoding='utf-8') as f:
json.dump(jsonContent, f, ensure_ascii=False, indent=4)
return jsonContent
I am doing this to convert a dictionary to json and save it in a file. If I try to print the json with Python, I get an unformatted dict like this:
myDict = {'first': {'phone': 1900, 'desktop': 1577, 'tablet': 148, 'bot': 9, 'other': 1}},
This is still okay. But when I open the file, I see something like this:
"{\"first\": {\"phone\": 1900, \"desktop\": 1577, \"tablet\": 148, \"bot\": 9, \"other\": 1}ยด}"
How can I remove all the backslashes and format it properly in both Python and the saved file?
Write to your JSON file like this if you don't want the backslashes
import json

# The nested dict we want on disk, pretty-printed and without escaped quotes.
myDict = {"first": {"phone": 1900,"other": 1}, "second": {"adwords": 1419, "no_om_source": 1223}}

# Serialise exactly once, then write the ready-made text: no double
# encoding, so no backslash-escaped quotes end up in the file.
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(myDict, ensure_ascii=False, indent=4))

Exporting response.txt to csv file

I'm trying to parse data that I receive from a curl request through python. The data is in the following format:
{'meta': {'from': '1520812800',
'granularity': 'daily',
'to': '1523232000',
'total': 6380},
'data': [{'count': 660, 'date': '2018-03-12'},
{'count': 894, 'date': '2018-03-13'}]}
Originally, the data was returned as a string probably because I used response.text to retrieve the data. I converted the string into a dictionary using ast.literal_eval(response.text). I managed to parse the "data" key and ignore "meta". So currently,
data = [{"date":"2018-03-12","count":660},{"date":"2018-03-13","count":894}]}`.
I am trying to export the values for "date" and "count" to a csv file. In my code I have this:
keys = data[0].keys()
print("----------KEYS:---------")
print keys #['date','count']
print("------------------------")
with open('mycsv.csv','wb') as output_file:
thewriter = csv.DictWriter(output_file, fieldnames =
['date','count'])
thewriter.writeheader()
thewriter.writerow(data)
However, python does not like this and gives me an error:
Traceback (most recent call last):
File "curlparser.py", line 45, in <module>
thewriter.writerow(data)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/csv.py", line 152, in writerow
return self.writer.writerow(self._dict_to_list(rowdict))
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/csv.py", line 148, in _dict_to_list
+ ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: {"date":"2018-03-12","count":660},{"date":"2018-03-13","count":894}

Create dictionary with different fields from different dictionaries

I have such list:
two_dimension_sizelist = \
[{u'sizeOptionId': u'1542',
u'sizeOptionName': u'1',
u'sortOrderNumber': u'915'},
{u'sizeOptionId': u'1543',
u'sizeOptionName': u'2',
u'sortOrderNumber': u'975'},
...
{u'sizeOptionId': u'1602',
u'sizeOptionName': u'Long',
u'sortOrderNumber': u'6873'}]
And I have some products:
{
"businessCatalogItemId":"5453220021802",
"inventoryStatusId":"0",
"colorName":"Medium Wash",
"sizeVariantId":"1",
"upcCode":"197476818021",
"onOrderDate":"2014-07-04T00:00:00-04:00",
"currentMinPrice":"34.94",
"currentPrice":"0.0",
"baseColorId":"1021",
"isClearanceItem":"false",
"catalogItemTypeId":"3",
"sizeDimension1Id":"1591",
"catalogItemSubtypeId":"15",
"isInStock":"true",
"skuId":"5453220021802",
"regularMaxPrice":"34.94",
"nowPrice":null,
"variantName":"Regular",
"isOnOrder":"false",
"mailOnlyReturn":"M",
"regularMinPrice":"34.94",
"reservable":"true",
"onlyAvailableOnline":"false",
"catalogItemId":"5146840",
"regularPrice":"0.0",
"isLowInventory":"false",
"sizeDimension2Id":"1601",
"priceType":"1",
"currentMaxPrice":"34.94"
},
{
"businessCatalogItemId":"5453220021803",
"inventoryStatusId":"4",
"colorName":"Medium Wash",
"sizeVariantId":"1",
"upcCode":"197476818038",
"onOrderDate":"2014-07-02T00:00:00-04:00",
"currentMinPrice":"34.94",
"currentPrice":"0.0",
"baseColorId":"1021",
"isClearanceItem":"false",
"catalogItemTypeId":"3",
"sizeDimension1Id":"1591",
"catalogItemSubtypeId":"15",
"isInStock":"true",
"skuId":"5453220021803",
"regularMaxPrice":"34.94",
"nowPrice":null,
"variantName":"Regular",
"isOnOrder":"true",
"mailOnlyReturn":"M",
"regularMinPrice":"34.94",
"reservable":"true",
"onlyAvailableOnline":"true",
"catalogItemId":"5146832",
"regularPrice":"0.0",
"isLowInventory":"false",
"sizeDimension2Id":"1602",
"priceType":"1",
"currentMaxPrice":"34.94"
}
Each product can have sizeDimension1Id and sizeDimension2Id or only sizeDimension1Id or none, I should map each size from two_diminsion_sizelist for each product to build dict:
{'size2Name': 'Long', 'size2Id': '1602', 'size1Name': '16', 'size1Id': '1590', {other product parameters}}
I have done this thing:
for dict_size in two_dimension_sizelist:
two_dimension_sizedict.update(
{dict_size['sizeOptionId']: dict_size['sizeOptionName']})
Which gives me in two_dimension_sizedict:
{u'1542': u'1',
u'1543': u'2',
u'1590': u'16',
u'1591': u'18',
u'1601': u'Regular',
u'1602': u'Long',
u'1604': u'Short',
u'1640': u'4',
u'1642': u'6',
u'1644': u'8',
u'1645': u'10',
u'1646': u'12',
u'1647': u'14'}
Am I doing right?
Now I don't quite know how to couple these sizes with the products.
It seems to me that you will scan through the two_dimension_sizelist a lot of times and search for matching "sizeOptionId" from your product dictionaries.
I would suggest doing this:
# Build the id -> name lookup once (O(1) membership tests) instead of
# rescanning two_dimension_sizelist for every product.
all_size_list = {str(x['sizeOptionId']): str(x['sizeOptionName'])
                 for x in two_dimension_sizelist}

for product in products_list:
    # The two dimensions get identical treatment, so drive both from one
    # table instead of duplicating the branch.  Dimension 2 only ever
    # appears alongside dimension 1 in the data.
    for dim_key, name_key, id_key in (
            ('sizeDimension1Id', 'size1Name', 'size1Id'),
            ('sizeDimension2Id', 'size2Name', 'size2Id')):
        size = product.get(dim_key)
        # `size in all_size_list` replaces the unidiomatic
        # `size in all_size_list.keys()` of the original.
        if size is not None and size in all_size_list:
            product[name_key] = all_size_list[size]
            product[id_key] = size
Your product list looks like a JSON string; convert it to Python values using the json library.

Best practice to store column-based data before to write to CSV in Python

I have the current code in Python 3:
import csv
if __name__ == '__main__':
sp500_data = [
{
'company': 'GOOGLE',
'headquarters': 'GOOGLEPLEX',
'industry': 'ADS',
'sector': 'TECH',
'symbol': 'GOOG'
},
{
'company': 'HEWLPA',
'headquarters': 'WHATEVER',
'industry': 'HARDWARE',
'sector': 'TECH',
'symbol': 'HP'
}
]
myfile = open("D:/test.csv", 'w', newline='')
wr = csv.DictWriter(myfile, delimiter='\t', quoting=csv.QUOTE_ALL, fieldnames=sp500_data[0].keys)
for sp500_company in sp500_data:
wr.writerow(sp500_company)
However this gives the following error:
Traceback (most recent call last):
File "D:\DEV\BlueTS\src\tsRetriever\dataRetriever\test.py", line 24, in <module>
wr.writerow(sp500_company)
File "C:\Python33\lib\csv.py", line 153, in writerow
return self.writer.writerow(self._dict_to_list(rowdict))
File "C:\Python33\lib\csv.py", line 146, in _dict_to_list
wrong_fields = [k for k in rowdict if k not in self.fieldnames]
File "C:\Python33\lib\csv.py", line 146, in <listcomp>
wrong_fields = [k for k in rowdict if k not in self.fieldnames]
TypeError: argument of type 'builtin_function_or_method' is not iterable
I would like to understand what I am doing wrong, and in addition to this, I would like to know what is the best way in Python to store column-based data which was originally organised in tables.
You forgot to call the .keys() method:
wr = csv.DictWriter(myfile, delimiter='\t', quoting=csv.QUOTE_ALL,
fieldnames=sp500_data[0].keys())
Note the () after sp500_data[0].keys; .keys is not an attribute, it is a method.
Using a csv.DictWriter() is an excellent method to turn data already in dictionary format into CSV data.

Categories

Resources