Extract array values from JSON with character removal in Python

I have the following JSON string and I am trying to extract the values into a Python list. I managed to get the id_list string, but I want every single value without the : in each of them.
EDIT:
Using Python's json library is not an option.
My approach (I have not used regex much before): https://regex101.com/r/qxYe9N/1
I want to use the expression with re.findall(EXPR, jsonstr) to receive a list like:
result = ["B01M8QSY16", "B017XBDBI6", ...more ]
{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}
EDIT:
Raw string:
{"ajax":{"params":{"asinMetadataKeys":"adId","featureId":"SimilaritiesCarousel","reftagPrefix":"pd_sbs_193","widgetTemplateClass":"PI::Similarities::ViewTemplates::Carousel::Desktop","imageHeight":160,"linkGetParameters":"{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"e672bcd4-b03e-11e8-8dbb-41abd883f66d\",\"pf_rd_r\":\"X5Z293FJ403CC225M759\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"CrGGS\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"ktYgt\"}","faceoutTemplateClass":"PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout","auiDeviceType":"desktop","imageWidth":160,"schemaVersion":2,"productDetailsTemplateClass":"PI::P13N::ViewTemplates::ProductDetails::Desktop::Base","forceFreshWin":0,"productDataFlavor":"Faceout","relatedRequestID":"X5Z293FJ403CC225M759","maxLineCount":6},"id_list":["B07BHS22V6:","B00ITJNHX6:","B07DDGCLZ1:","B017XYQ4X2:","B01LYA8CLG:","B0747T62HS:","B00LHT0I78:","B071D5LL18:","B071NPLTRS:","B00CFMRFO0:","B01N4X1EL9:","B077R4WZ46:","B00YTZSTVY:","B073V5T8G2:","B00CFMRI7E:","B01ARIYIPM:","B0747X16FY:","B00ZWNPJVA:","B01N4WZ4AL:","B00BU662AU:","B07C2NYVMP:","B01FD7ZOB4:","B017M17VTC:","B00YTZST0K:","B07CVSJG6H:","B00V63GQBC:","B00NYBAJJY:","B01MCZ2ZQC:","B078BSJ8TV:","B077QXWJBR:","B07BL5FWVP:","B00N8SPSSU:","B01LXMVFGI:","B06ZY83D2Z:","B00ZQYY9TI:","B0761HT6JJ:","B06XRWB686:","B075XHDQ85:","B01LYJMK02:","B018JWYKRE:","B0759W61P6:","B078ZKNGRS:","B013BJBZBE:","B01LYMTVY2:","B072VMTVGZ:","B077QXW1Z9:","B07CMB96BX:","B07BNXNMZ5:","B01N3CY4Y3:","B018JX3J7U:","B0747T5MY1:","B07CQPTFDB:","B077QW292J:","B00LHT0GLQ:","B01C4B17XG:","B019WD74F4:"],"url":"/gp/p13n-shared/faceout-partial","id_param_name":"asins"},"baseAsin":"B01LS24R2U","name":"desktop-dp-sims_session-similarities","set_size":56}

Just use Python's json library:
import json
j1 = """{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}"""
d1 = json.loads(j1)
id_list = [elem.replace(":", "") for elem in d1["ajax"]['id_list']]
id_list
Output:
['B01M8QSY16',
'B017XBDBI6',
...
'B00R25QZHS']
I had to remove the "linkGetParameters": ... line because it does not seem to be valid JSON.

If you are sure that the "id_list" attribute will always be on one line, in a similar format with a single space after commas and after the colon, and the json module is not an option, then you can do the following:
import re

list(                                  # make sure the result is a list
    filter(                            # filter to…
        None,                          # …remove any empty items
        re.split(                      # split the line of id_list on…
            r':(?:,\s)?',              # …colon and then optional comma and spaces
            re.search(                 # search j1 (the JSON string)…
                r'(?<="id_list": \[)((?:"[^"]+:"(?:,\s*)?)+)', j1)  # …for the id_list property and its value
            .group(0)                  # take the match
            .replace('"', '')          # and drop all double quotes
        )))
['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']
This is dense and mostly unreadable code; use it as is, or I can break the logic down more readably if you want.

Seeing as you can't use the json library, you can try this expression (tested on Python 3):
result = [id.strip('":') for id in re.search(r'"id_list": \[(.*)\],', jsonstr).group(1).split(", ")]
(where jsonstr is a string containing all of the original JSON code).
To make it easier to understand, the above code uses
re.search (not re.findall as you had suggested) to broadly locate and select the line,
group to narrow down the selection,
split to transform the string into a list, and
strip to trim off the unnecessary characters in each list item
leaving you with a list of IDs like the one you specify in your question.

First, as Florian H stated, you should request valid JSON from your source in order to be able to use Python's json module. Someone who provides JSON should provide valid JSON...
EDIT: The JSON seems valid, see below
Trying to use the json module anyway to address your need, I noted that the parsing problem comes from the escaped double quotes in the linkGetParameters value.
I assume the JSON string has been copied/pasted as is, and this is probably the source of the JSON parsing problem. Simply pasting this JSON into a Python string makes Python treat the backslashes as escapes for the double quotes instead of preserving the two characters.
To test the JSON content, you have to copy it into a raw string (prefixed with r):
import json
json_ = r"""{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}"""
result = json.loads(json_)
print([id_[:-1] for id_ in result['ajax']['id_list']])
# ['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']
Once the id_list is retrieved, you can remove the last character of each id using string slicing.
When using JSON content from your original source instead of a literal string, you should not encounter this kind of escaping problem.
If it is really not possible, assuming an id is always 10 characters long, this should do the trick:
import re
json = """{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}"""
# https://regex101.com/r/qxYe9N/11
id_re = re.compile('"([A-Z0-9]{10}):"')
result = id_re.findall(json_str)
print(result)
# ['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']

Related

A regex for CSV parsing? Python3 Re module

Is there a regex (Python re compatible) that I can use for parsing csv?
EDIT: I didn't realize there was a csv module in Python's standard library
Here's the regex: (?<!,\"\w)\s*,(?!\w\s*\",). It's Python-compatible and JavaScript-compatible. Here's the full parsing script (as a Python function):
def parseCSV(csvDoc, output_type="dict"):
    from re import compile as c
    from json import dumps
    from numpy import array
    # This is where all the parsing happens
    """
    To parse csv files.
    Arguments:
    csvDoc - The csv document to parse.
    output_type - the output type this
                  function will return
    """
    csvparser = c('(?<!,\"\\w)\\s*,(?!\\w\\s*\",)')
    lines = str(csvDoc).split('\n')
    # All the lines are not empty
    necessary_lines = [line for line in lines if line != ""]
    All = array([csvparser.split(line) for line in necessary_lines])
    if output_type.lower() in ("dict", "json"):  # If you want JSON or dict
        # All the python dict keys required (At the top of the file or top row)
        top_line = list(All[0])
        main_table = {}  # The parsed data will be here
        main_table[top_line[0]] = {
            name[0]: {
                thing: name[
                    # The 'actual value' counterpart
                    top_line.index(thing)
                ] for thing in top_line[1:]  # The requirements
            } for name in All[1:]
        }
        return dumps(main_table, skipkeys=True, ensure_ascii=False, indent=1)
    elif output_type.lower() in ("list",
                                 "numpy",
                                 "array",
                                 "matrix",
                                 "np.array",
                                 "np.ndarray",
                                 "numpy.array",
                                 "numpy.ndarray"):
        return All
    else:
        # All the python dict keys required (At the top of the file or top row)
        top_line = list(All[0])
        main_table = {}  # The parsed data will be here
        main_table[top_line[0]] = {
            name[0]: {
                thing: name[
                    # The 'actual value' counterpart
                    top_line.index(thing)
                ] for thing in top_line[1:]  # The requirements
            } for name in All[1:]
        }
        return dumps(main_table, skipkeys=True, ensure_ascii=False, indent=1)
Dependencies: NumPy
All you need to do is chuck in the raw text of the CSV file and the function will return JSON (or a 2-dimensional list if you wish) in this format:
{"top-left-corner name":{
"foo":{"Item 1 left to foo":"Item 2 of the top row",
"Item 2 left to foo":"Item 3 of the top row",
...}
"bar":{...}
}
}
And here's an example of it:
CSV.csv
foo,bar,zbar
foo_row,foo1,,
barie,"2,000",,
and it outputs:
{
"foo": {
"foo_row": {
"bar": "foo1",
"zbar": ""
},
"barie": {
"bar": "\"2,000\"",
"zbar": ""
}
}
}
It should work if your CSV file is formatted correctly (the ones I tested were made by Apple's Numbers).

How to parse tab-delimited text file with 4th column as json and remove certain keys?

I have a text file that is 26 GB. The line format is as follows:
/type/edition /books/OL10000135M 4 2010-04-24T17:54:01.503315 {"publishers": ["Bernan Press"], "physical_format": "Hardcover", "subtitle": "9th November - 3rd December, 1992", "key": "/books/OL10000135M", "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93", "identifiers": {"goodreads": ["6850240"]}, "isbn_13": ["9780107805401"], "languages": [{"key": "/languages/eng"}], "number_of_pages": 64, "isbn_10": ["0107805405"], "publish_date": "December 1993", "last_modified": {"type": "/type/datetime", "value": "2010-04-24T17:54:01.503315"}, "authors": [{"key": "/authors/OL2645777A"}], "latest_revision": 4, "works": [{"key": "/works/OL7925046W"}], "type": {"key": "/type/edition"}, "subjects": ["Government - Comparative", "Politics / Current Events"], "revision": 4}
I'm trying to get only the last column, which is JSON, and from that JSON I only want to save the "title", "isbn 13", "isbn 10".
I was able to save only the last column with this code
import csv
import sys

csv.field_size_limit(sys.maxsize)
# File names: to read in from and read out to
input_file = '../inputFile/ol_dump_editions_2019-10-31.txt'
output_file = '../outputFile/output.txt'
## ==================== ##
##  Using module 'csv'  ##
## ==================== ##
with open(input_file) as to_read:
    with open(output_file, "w") as tmp_file:
        reader = csv.reader(to_read, delimiter="\t")
        writer = csv.writer(tmp_file)
        desired_column = [4]  # text column
        for row in reader:  # read one row at a time
            myColumn = list(row[i] for i in desired_column)  # build the output row (process)
            writer.writerow(myColumn)  # write it
but this doesn't return a proper JSON object; instead it returns everything with doubled quotation marks next to it. Also, how would I extract certain values from the JSON as a new JSON?
EDIT:
"{""publishers"": [""Bernan Press""], ""physical_format"": ""Hardcover"", ""subtitle"": ""9th November - 3rd December, 1992"", ""key"": ""/books/OL10000135M"", ""title"": ""Parliamentary Debates, House of Lords, Bound Volumes, 1992-93"", ""identifiers"": {""goodreads"": [""6850240""]}, ""isbn_13"": [""9780107805401""], ""languages"": [{""key"": ""/languages/eng""}], ""number_of_pages"": 64, ""isbn_10"": [""0107805405""], ""publish_date"": ""December 1993"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-24T17:54:01.503315""}, ""authors"": [{""key"": ""/authors/OL2645777A""}], ""latest_revision"": 4, ""works"": [{""key"": ""/works/OL7925046W""}], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Government - Comparative"", ""Politics / Current Events""], ""revision"": 4}"
EDIT 2:
So I'm trying to read this file, which is a tab-separated file with the following columns:
type - type of record (/type/edition, /type/work etc.)
key - unique key of the record. (/books/OL1M etc.)
revision - revision number of the record
last_modified - last modified timestamp
JSON - the complete record in JSON format
I'm trying to read the JSON and from that JSON I'm only trying to get the "title", "isbn 13", "isbn 10" as JSON and save it to the file as a row,
so every row should look like the original but with only those keys and values.
Here's a straightforward way of doing it. You would need to repeat this to extract the desired data from each line of the file as it's being read, line by line (the default way text file reading is handled in Python); a sketch of that loop follows the output below.
import json
line = '/type/edition /books/OL10000135M 4 2010-04-24T17:54:01.503315 {"publishers": ["Bernan Press"], "physical_format": "Hardcover", "subtitle": "9th November - 3rd December, 1992", "key": "/books/OL10000135M", "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93", "identifiers": {"goodreads": ["6850240"]}, "isbn_13": ["9780107805401"], "languages": [{"key": "/languages/eng"}], "number_of_pages": 64, "isbn_10": ["0107805405"], "publish_date": "December 1993", "last_modified": {"type": "/type/datetime", "value": "2010-04-24T17:54:01.503315"}, "authors": [{"key": "/authors/OL2645777A"}], "latest_revision": 4, "works": [{"key": "/works/OL7925046W"}], "type": {"key": "/type/edition"}, "subjects": ["Government - Comparative", "Politics / Current Events"], "revision": 4}'
csv_cols = line.split('\t')
json_data = json.loads(csv_cols[4])
#print(json.dumps(json_data, indent=4))
desired = {key: json_data[key] for key in ("title", "isbn_13", "isbn_10")}
result = json.dumps(desired, indent=4)
print(result)
Output from sample line:
{
    "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93",
    "isbn_13": [
        "9780107805401"
    ],
    "isbn_10": [
        "0107805405"
    ]
}
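If that works for a single line, a minimal sketch of applying it line by line over the whole dump and writing one trimmed JSON object per output row might look like this (using the input/output paths from your question; .get() is used as a precaution in case a record lacks one of the keys):
import json

input_file = '../inputFile/ol_dump_editions_2019-10-31.txt'
output_file = '../outputFile/output.txt'

with open(input_file) as to_read, open(output_file, 'w') as out:
    for line in to_read:
        # the 5th tab-separated column is the JSON record
        json_data = json.loads(line.split('\t')[4])
        desired = {key: json_data.get(key) for key in ("title", "isbn_13", "isbn_10")}
        out.write(json.dumps(desired) + '\n')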
So given that your current code returns the following:
result = '{""publishers"": [""Bernan Press""], ""physical_format"": ""Hardcover"", ""subtitle"": ""9th November - 3rd December, 1992"", ""key"": ""/books/OL10000135M"", ""title"": ""Parliamentary Debates, House of Lords, Bound Volumes, 1992-93"", ""identifiers"": {""goodreads"": [""6850240""]}, ""isbn_13"": [""9780107805401""], ""languages"": [{""key"": ""/languages/eng""}], ""number_of_pages"": 64, ""isbn_10"": [""0107805405""], ""publish_date"": ""December 1993"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-24T17:54:01.503315""}, ""authors"": [{""key"": ""/authors/OL2645777A""}], ""latest_revision"": 4, ""works"": [{""key"": ""/works/OL7925046W""}], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Government - Comparative"", ""Politics / Current Events""], ""revision"": 4}'
Looks like what you need to do is: first, replace those doubled double quotes with regular double quotes, otherwise things are not parsable:
res = result.replace('""','"')
Now res is convertible to a JSON object:
import json
my_json = json.loads(res)
my_json now looks like this:
{'authors': [{'key': '/authors/OL2645777A'}],
'identifiers': {'goodreads': ['6850240']},
'isbn_10': ['0107805405'],
'isbn_13': ['9780107805401'],
'key': '/books/OL10000135M',
'languages': [{'key': '/languages/eng'}],
'last_modified': {'type': '/type/datetime',
'value': '2010-04-24T17:54:01.503315'},
'latest_revision': 4,
'number_of_pages': 64,
'physical_format': 'Hardcover',
'publish_date': 'December 1993',
'publishers': ['Bernan Press'],
'revision': 4,
'subjects': ['Government - Comparative', 'Politics / Current Events'],
'subtitle': '9th November - 3rd December, 1992',
'title': 'Parliamentary Debates, House of Lords, Bound Volumes, 1992-93',
'type': {'key': '/type/edition'},
'works': [{'key': '/works/OL7925046W'}]}
You can conveniently get any field you want from this object:
my_json['title']
# 'Parliamentary Debates, House of Lords, Bound Volumes, 1992-93'
my_json['isbn_10'][0]
# '0107805405'
Especially because your example is so large, I'd recommend using a specialized library such as pandas, which has a read_csv method, or even dask, which supports out-of-memory operations.
Both of these systems will automatically parse out the quotations for you, and dask will do so in "pieces" direct from disk so you never have to try to load 26GB into RAM.
In both libraries, you can then access the columns you want like this:
data = read_csv(PATH)
data["ColumnName"]
You can then parse these rows either using json.loads() (import json) or you can use the pandas/dask json implementations. If you can give some more details of what you're expecting, I can help you draft a more specific code example.
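In the meantime, a rough sketch of the pandas route might look like the following; the column names are made up here, quoting=3 (csv.QUOTE_NONE) stops pandas from interpreting the quotes inside the JSON column, and chunksize keeps memory bounded for a 26 GB file:
import json
import pandas as pd

# column names are assumptions; the dump has no header row
cols = ["type", "key", "revision", "last_modified", "json"]
wanted = []
for chunk in pd.read_csv('../inputFile/ol_dump_editions_2019-10-31.txt',
                         sep='\t', names=cols, quoting=3, chunksize=100000):
    for record in chunk["json"]:
        data = json.loads(record)
        wanted.append({k: data.get(k) for k in ("title", "isbn_13", "isbn_10")})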
Good luck!
I saved your data to a file to see if I could read just the rows; let me know if this works:
# zzread is assumed to hold the full contents of the file as a string
lines = zzread.split('\n')
temp = []
for to_read in lines:
    if len(to_read) == 0:
        break
    new_to_read = '{' + to_read.split('{', 1)[1]
    temp.append(json.loads(new_to_read))
for row in temp:
    print(row['isbn_13'])
If that works, this should create the JSON records for you:
lines = zzread.split('\n')
temp = []
for to_read in lines:
    if len(to_read) == 0:
        break
    new_to_read = '{' + to_read.split('{', 1)[1]
    temp.append(json.loads(new_to_read))
new_json = []
for row in temp:
    new_json.append({'title': row['title'], 'isbn_13': row['isbn_13'], 'isbn_10': row['isbn_10']})
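If you then want to save those records to a file one per row, as described in the question, a minimal sketch (reusing the output path from your question) would be:
import json

with open('../outputFile/output.txt', 'w') as out:
    for entry in new_json:
        out.write(json.dumps(entry) + '\n')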

Dictionary from a String with particular structure

I am using Python 3 to read this file and convert it to a dictionary.
I have this string from a file and I would like to know how it would be possible to create a dictionary from it.
[User]
Date=10/26/2003
Time=09:01:01 AM
User=teodor
UserText=Max Cor
UserTextUnicode=392039n9dj90j32
[System]
Type=Absolute
Dnumber=QS236
Software=1.1.1.2
BuildNr=0923875
Source=LAM
Column=OWKD
[Build]
StageX=12345
Spotter=2
ApertureX=0.0098743
ApertureY=0.2431899
ShiftXYZ=-4.234809e-002
[Text]
Text=Here is the Text files
DataBaseNumber=The database number is 918723
..... (There are more than 1000 lines per file) ...
In the text I have "Name=Something" lines, and I would like to convert them as follows:
{'Date': '10/26/2003',
 'Time': '09:01:01 AM',
 'User': 'teodor',
 'UserText': 'Max Cor',
 'UserTextUnicode': '392039n9dj90j32', .......}
The word between [ ] can be removed, like [User], [System], [Build], [Text], etc...
In some fields there is only the first part of the string:
[Colors]
Red=
Blue=
Yellow=
DarkBlue=
What you have is an ordinary properties file. You can use this example (Java) to read the values into a map:
try (InputStream input = new FileInputStream("your_file_path")) {
    Properties prop = new Properties();
    prop.load(input);
    // prop.getProperty("User") == "teodor"
} catch (IOException ex) {
    ex.printStackTrace();
}
EDIT:
For a Python solution, refer to the answered question.
You can use configparser to read .ini or .properties files (the format you have).
import configparser
config = configparser.ConfigParser()
config.read('your_file_path')
# config['User'] == {'Date': '10/26/2003', 'Time': '09:01:01 AM'...}
# config['User']['User'] == 'teodor'
# config['System'] == {'Type': 'Absolute', ...}
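If you want the single flat dictionary shown in the question (section names dropped), a small sketch merging the sections could look like this; note that configparser lowercases option names by default:
import configparser

config = configparser.ConfigParser()
config.read('your_file_path')

# merge every section into one flat dict, as in the desired output
flat = {}
for section in config.sections():
    flat.update(dict(config[section]))

print(flat['date'])  # '10/26/2003' (keys are lowercased by default)
print(flat['red'])   # '' for the empty [Colors] entries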
This can easily be done in Python, assuming your file is named test.txt.
This will also work for lines with nothing after the = as well as lines with multiple =.
d = {}
with open('test.txt', 'r') as f:
    for line in f:
        line = line.strip()  # Remove any space or newline characters
        parts = line.split('=')  # Split around the `=`
        if len(parts) > 1:
            d[parts[0]] = '='.join(parts[1:])  # rejoin in case the value itself contains '='
print(d)
Output:
{
"Date": "10/26/2003",
"Time": "09:01:01 AM",
"User": "teodor",
"UserText": "Max Cor",
"UserTextUnicode": "392039n9dj90j32",
"Type": "Absolute",
"Dnumber": "QS236",
"Software": "1.1.1.2",
"BuildNr": "0923875",
"Source": "LAM",
"Column": "OWKD",
"StageX": "12345",
"Spotter": "2",
"ApertureX": "0.0098743",
"ApertureY": "0.2431899",
"ShiftXYZ": "-4.234809e-002",
"Text": "Here is the Text files",
"DataBaseNumber": "The database number is 918723"
}
I would suggest doing some cleaning to get rid of the [...] lines.
After that you can split the remaining lines on the "=" separator and then convert them to a dictionary, as sketched below.
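A minimal sketch of that idea (assuming the data is in a file called test.txt, as in the other answer):
d = {}
with open('test.txt') as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith('['):  # skip blank lines and the [Section] headers
            continue
        key, _, value = line.partition('=')   # keep everything after the first '='
        d[key] = value

print(d['User'])  # 'teodor'
print(d['Red'])   # '' for empty values like 'Red='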

Dynamically double-quote "keys" in text to form valid JSON string in python

I'm working with text contained in JS variables on a webpage and extracting strings using regex, then turning them into JSON objects in Python using json.loads().
The issue I'm having is the unquoted "keys". Right now, I'm doing a series of replacements (code below) to double-quote each key in each string, but what I want is to dynamically identify any unquoted keys before passing the string into json.loads().
Example 1 with no space after : character
json_data1 = '[{storeName:"testName",address:"12345 Road",address2:"Suite 500",city:"testCity",storeImage:"http://www.testLink.com",state:"testState",phone:"999-999-9999",lat:99.9999,lng:-99.9999}]'
Example 2 with space after : character
json_data2 = '[{storeName: "testName",address: "12345 Road",address2: "Suite 500",city: "testCity",storeImage: "http://www.testLink.com",state: "testState",phone: "999-999-9999",lat: 99.9999,lng: -99.9999}]'
Example 3 with space after ,: characters
json_data3 = '[{storeName: "testName", address: "12345 Road", address2: "Suite 500", city: "testCity", storeImage: "http://www.testLink.com", state: "testState", phone: "999-999-9999", lat: 99.9999, lng: -99.9999}]'
Example 4 with space after : character and newlines
json_data4 = '''[
{
storeName: "testName",
address: "12345 Road",
address2: "Suite 500",
city: "testCity",
storeImage: "http://www.testLink.com",
state: "testState",
phone: "999-999-9999",
lat: 99.9999, lng: -99.9999
}]'''
I need to create a pattern that identifies which parts are keys, and not random string values containing such characters, like the URL in storeImage. In other words, I want to dynamically find the keys and double-quote them so I can use json.loads() and get back a valid JSON object.
I'm currently replacing each key in the text this way
content = re.sub('storeName:', '"storeName":', content)
content = re.sub('address:', '"address":', content)
content = re.sub('address2:', '"address2":', content)
content = re.sub('city:', '"city":', content)
content = re.sub('storeImage:', '"storeImage":', content)
content = re.sub('state:', '"state":', content)
content = re.sub('phone:', '"phone":', content)
content = re.sub('lat:', '"lat":', content)
content = re.sub('lng:', '"lng":', content)
Returned as string representing valid JSON
json_data = [{"storeName": "testName", "address": "12345 Road", "address2": "Suite 500", "city": "testCity", "storeImage": "http://www.testLink.com", "state": "testState", "phone": "999-999-9999", "lat": 99.9999, "lng": -99.9999}]
I'm sure there is a better way of doing this but I haven't been able to find or come up with a regex pattern to handle these. Any help is greatly appreciated!
Something like this should do the job: ([{,]\s*)([^"':]+)(\s*:)
Replace with: \1"\2"\3
Example: https://regex101.com/r/oV0udR/1
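In Python, that search-and-replace could be applied before json.loads() roughly like this (a sketch using json_data1 from the question):
import json
import re

json_data1 = '[{storeName:"testName",address:"12345 Road",address2:"Suite 500",city:"testCity",storeImage:"http://www.testLink.com",state:"testState",phone:"999-999-9999",lat:99.9999,lng:-99.9999}]'

# quote anything between '{' or ',' and a ':' that is not already quoted
quoted = re.sub(r"([{,]\s*)([^\"':]+)(\s*:)", r'\1"\2"\3', json_data1)
data = json.loads(quoted)
print(data[0]["storeName"])  # testName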
That repetition is of course unnecessary. You could put everything into a single regex:
content = re.sub(r"\b(storeName|address2?|city|storeImage|state|phone|lat|lng):", r'"\1":', content)
\1 contains the match within the first (in this case, only) set of parentheses, so "\1": surrounds it with quotes and adds back the colon.
Note the use of a word boundary anchor to make sure we match only those exact words.
Regex: (\w+)\s?:\s?("?[^",]+"?,?)
Regex demo
import re
text = 'storeName: "testName", '
text = re.sub(r'(\w+)\s?:\s?("?[^",]+"?,?)', r'"\g<1>":\g<2>', text)
print(text)
Output: "storeName":"testName",

Getting certain information from string

I'm new to Python and was wondering how I could get the estimatedWait and routeName from this string.
{
"lastUpdated": "07:52",
"filterOut": [],
"arrivals": [
{
"routeId": "B16",
"routeName": "B16",
"destination": "Kidbrooke",
"estimatedWait": "due",
"scheduledTime": "06: 53",
"isRealTime": true,
"isCancelled": false
},
{
"routeId":"B13",
"routeName":"B13",
"destination":"New Eltham",
"estimatedWait":"29 min",
"scheduledTime":"07:38",
"isRealTime":true,
"isCancelled":false
}
],
"serviceDisruptions":{
"infoMessages":[],
"importantMessages":[],
"criticalMessages":[]
}
}
And then save this to another string which would be displayed in LXTerminal on the Raspberry Pi 2. I would like only the 'routeName' of B16 to be saved to the string. How do I do that?
You just have to deserialise the object and then use the index to access the data you want.
To find only the B16 entries you can filter the arrivals list.
import json

obj = json.loads(json_string)
# filter only the b16 objects (wrapped in list() so it can be indexed and tested in Python 3)
b16_objs = list(filter(lambda a: a['routeName'] == 'B16', obj['arrivals']))
if b16_objs:
    # get the first item
    b16 = b16_objs[0]
    my_estimatedWait = b16['estimatedWait']
    print(my_estimatedWait)
You can use string.find() to get the indices of those value identifiers
and extract them.
Example:
def get_values(string):
    # move past the identifier itself so the slice starts at the value
    waitIndice = string.find('"estimatedWait":"') + len('"estimatedWait":"')
    routeIndice = string.find('"routeName":"') + len('"routeName":"')
    estimatedWait = string[waitIndice:string.find('"', waitIndice)]
    routeName = string[routeIndice:string.find('"', routeIndice)]
    return estimatedWait, routeName
Or you could just deserialize the json object (highly recommended)
import json

def get_values(string):
    jsonData = json.loads(string)
    estimatedWait = jsonData['arrivals'][0]['estimatedWait']
    routeName = jsonData['arrivals'][0]['routeName']
    return estimatedWait, routeName
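For example, assuming json_string holds the JSON from your question, calling it returns the values of the first arrival:
# json_string is assumed to contain the JSON from the question
wait, route = get_values(json_string)
print(route, wait)  # B16 due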