I have a 26 GB text file. The line format is as follows:
/type/edition /books/OL10000135M 4 2010-04-24T17:54:01.503315 {"publishers": ["Bernan Press"], "physical_format": "Hardcover", "subtitle": "9th November - 3rd December, 1992", "key": "/books/OL10000135M", "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93", "identifiers": {"goodreads": ["6850240"]}, "isbn_13": ["9780107805401"], "languages": [{"key": "/languages/eng"}], "number_of_pages": 64, "isbn_10": ["0107805405"], "publish_date": "December 1993", "last_modified": {"type": "/type/datetime", "value": "2010-04-24T17:54:01.503315"}, "authors": [{"key": "/authors/OL2645777A"}], "latest_revision": 4, "works": [{"key": "/works/OL7925046W"}], "type": {"key": "/type/edition"}, "subjects": ["Government - Comparative", "Politics / Current Events"], "revision": 4}
I'm trying to get only the last column, which is JSON, and from that JSON I only want to save the "title", "isbn_13" and "isbn_10" fields.
I was able to save only the last column with this code
# Lift the csv module's per-field size cap so very long fields can be read.
csv.field_size_limit(sys.maxsize)

# Tab-separated source dump and the destination file.
input_file = '../inputFile/ol_dump_editions_2019-10-31.txt'
output_file = '../outputFile/output.txt'

# Indexes of the columns to keep; index 4 is the JSON text column.
desired_column = [4]

# Stream the dump one row at a time and copy only the selected columns.
# NOTE: csv.writer applies CSV quoting on output, so a field that contains
# double quotes is wrapped in quotes and every inner '"' is written as '""'.
with open(input_file) as to_read, open(output_file, "w") as tmp_file:
    reader = csv.reader(to_read, delimiter="\t")
    writer = csv.writer(tmp_file)
    for row in reader:
        writer.writerow([row[i] for i in desired_column])
But this doesn't produce a proper JSON object; instead, every double quote in the output is doubled (see the EDIT below). Also, how would I extract certain values from the JSON into a new JSON object?
EDIT:
"{""publishers"": [""Bernan Press""], ""physical_format"": ""Hardcover"", ""subtitle"": ""9th November - 3rd December, 1992"", ""key"": ""/books/OL10000135M"", ""title"": ""Parliamentary Debates, House of Lords, Bound Volumes, 1992-93"", ""identifiers"": {""goodreads"": [""6850240""]}, ""isbn_13"": [""9780107805401""], ""languages"": [{""key"": ""/languages/eng""}], ""number_of_pages"": 64, ""isbn_10"": [""0107805405""], ""publish_date"": ""December 1993"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-24T17:54:01.503315""}, ""authors"": [{""key"": ""/authors/OL2645777A""}], ""latest_revision"": 4, ""works"": [{""key"": ""/works/OL7925046W""}], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Government - Comparative"", ""Politics / Current Events""], ""revision"": 4}"
EDIT 2:
So I'm trying to read this file, which is a tab-separated file with the following columns:
type - type of record (/type/edition, /type/work etc.)
key - unique key of the record. (/books/OL1M etc.)
revision - revision number of the record
last_modified - last modified timestamp
JSON - the complete record in JSON format
I'm trying to read the JSON column and keep only the "title", "isbn_13" and "isbn_10" fields from it, then save that reduced JSON to the output file as one row,
so every row should look like the original JSON but with only those keys and values.
Here's a straight-forward way of doing it. You would need to repeat this and extract the desired data from each line of the file as it's being read, line-by-line (the default way text file reading is handled in Python).
import json

# Keys copied from each record into the reduced JSON object.
WANTED_KEYS = ("title", "isbn_13", "isbn_10")


def extract_fields(line, keys=WANTED_KEYS):
    """Return the selected keys of the JSON record stored in one dump line.

    Args:
        line: One tab-separated line whose fifth column is a JSON object.
        keys: Names of the JSON keys to keep.

    Returns:
        A dict holding only those of ``keys`` that the record contains;
        keys missing from the record are left out instead of raising.

    Raises:
        IndexError: If the line has fewer than five tab-separated columns.
        json.JSONDecodeError: If the fifth column is not valid JSON.
    """
    # maxsplit=4: the JSON is the fifth and last column, so everything after
    # the fourth tab is handed to the JSON parser in one piece.
    json_text = line.split('\t', 4)[4]
    record = json.loads(json_text)
    return {key: record[key] for key in keys if key in record}


# Sample line; the column separators are written as explicit \t escapes so
# they survive copy/paste.
line = '/type/edition\t/books/OL10000135M\t4\t2010-04-24T17:54:01.503315\t{"publishers": ["Bernan Press"], "physical_format": "Hardcover", "subtitle": "9th November - 3rd December, 1992", "key": "/books/OL10000135M", "title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93", "identifiers": {"goodreads": ["6850240"]}, "isbn_13": ["9780107805401"], "languages": [{"key": "/languages/eng"}], "number_of_pages": 64, "isbn_10": ["0107805405"], "publish_date": "December 1993", "last_modified": {"type": "/type/datetime", "value": "2010-04-24T17:54:01.503315"}, "authors": [{"key": "/authors/OL2645777A"}], "latest_revision": 4, "works": [{"key": "/works/OL7925046W"}], "type": {"key": "/type/edition"}, "subjects": ["Government - Comparative", "Politics / Current Events"], "revision": 4}'
desired = extract_fields(line)
result = json.dumps(desired, indent=4)
print(result)
Output from sample line:
{
"title": "Parliamentary Debates, House of Lords, Bound Volumes, 1992-93",
"isbn_13": [
"9780107805401"
],
"isbn_10": [
"0107805405"
]
}
So given that your current code returns the following:
result = '{""publishers"": [""Bernan Press""], ""physical_format"": ""Hardcover"", ""subtitle"": ""9th November - 3rd December, 1992"", ""key"": ""/books/OL10000135M"", ""title"": ""Parliamentary Debates, House of Lords, Bound Volumes, 1992-93"", ""identifiers"": {""goodreads"": [""6850240""]}, ""isbn_13"": [""9780107805401""], ""languages"": [{""key"": ""/languages/eng""}], ""number_of_pages"": 64, ""isbn_10"": [""0107805405""], ""publish_date"": ""December 1993"", ""last_modified"": {""type"": ""/type/datetime"", ""value"": ""2010-04-24T17:54:01.503315""}, ""authors"": [{""key"": ""/authors/OL2645777A""}], ""latest_revision"": 4, ""works"": [{""key"": ""/works/OL7925046W""}], ""type"": {""key"": ""/type/edition""}, ""subjects"": [""Government - Comparative"", ""Politics / Current Events""], ""revision"": 4}'
It looks like what you need to do first is replace those doubled double quotes with regular double quotes; otherwise the text is not parsable:
# Undo the quote doubling: every '""' pair in the text becomes a single '"'.
res = result.replace('""','"')
Now res is convertible to a JSON object:
import json
# Parse the cleaned-up text into Python objects.
my_json = json.loads(res)
my_json now looks like this:
{'authors': [{'key': '/authors/OL2645777A'}],
'identifiers': {'goodreads': ['6850240']},
'isbn_10': ['0107805405'],
'isbn_13': ['9780107805401'],
'key': '/books/OL10000135M',
'languages': [{'key': '/languages/eng'}],
'last_modified': {'type': '/type/datetime',
'value': '2010-04-24T17:54:01.503315'},
'latest_revision': 4,
'number_of_pages': 64,
'physical_format': 'Hardcover',
'publish_date': 'December 1993',
'publishers': ['Bernan Press'],
'revision': 4,
'subjects': ['Government - Comparative', 'Politics / Current Events'],
'subtitle': '9th November - 3rd December, 1992',
'title': 'Parliamentary Debates, House of Lords, Bound Volumes, 1992-93',
'type': {'key': '/type/edition'},
'works': [{'key': '/works/OL7925046W'}]}
You can conveniently get any field you want from this object:
my_json['title']
# 'Parliamentary Debates, House of Lords, Bound Volumes, 1992-93'
my_json['isbn_10'][0]
# '0107805405'
Especially because your file is so large, I'd recommend using a specialized library such as pandas, which has a read_csv method, or even dask, which supports out-of-core (larger-than-memory) operations.
Both of these libraries will automatically handle the quoting for you, and dask will do so in "pieces" directly from disk, so you never have to try to load 26 GB into RAM.
In both libraries, you can then access the columns you want like this:
data = read_csv(PATH)
data["ColumnName"]
You can then parse these rows either using json.loads() (import json) or you can use the pandas/dask json implementations. If you can give some more details of what you're expecting, I can help you draft a more specific code example.
Good luck!
I saved your data to a file to see if I could read just the rows; let me know if this works:
# zzread is expected to hold the file contents as one string, one record per line.
lines = zzread.split('\n')
temp = []
for to_read in lines:
    # Skip blank lines (such as the one after a trailing newline) instead of
    # stopping at the first one, so records after a blank line are not lost.
    if not to_read.strip():
        continue
    # The JSON record is everything from the first '{' to the end of the line.
    new_to_read = '{' + to_read.split('{', 1)[1]
    temp.append(json.loads(new_to_read))

for row in temp:
    # .get: a record without an isbn_13 key prints None instead of raising KeyError.
    print(row.get('isbn_13'))
If that works this should create a json for you:
# zzread is expected to hold the file contents as one string, one record per line.
lines = zzread.split('\n')
temp = []
for to_read in lines:
    # Skip blank lines (such as the one after a trailing newline) instead of
    # stopping at the first one, so records after a blank line are not lost.
    if not to_read.strip():
        continue
    # The JSON record is everything from the first '{' to the end of the line.
    new_to_read = '{' + to_read.split('{', 1)[1]
    temp.append(json.loads(new_to_read))

# Reduce every record to the wanted keys; a key the record does not have is
# left out instead of raising KeyError.
new_json = [
    {key: row[key] for key in ('title', 'isbn_13', 'isbn_10') if key in row}
    for row in temp
]
I have the following JSON string and I am trying to extract the values to a python list. I achieved getting the id_list string but I want to get every single value without the : in each of them.
EDIT:
Using python json library is not an option.
My approach (never used a lot of regex before): https://regex101.com/r/qxYe9N/1
I want to use the expression with re.filterall(EXPR, jsonstr) to receive a list like:
result = ["B01M8QSY16", "B017XBDBI6", ...more ]
{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}
EDIT:
Raw string:
{"ajax":{"params":{"asinMetadataKeys":"adId","featureId":"SimilaritiesCarousel","reftagPrefix":"pd_sbs_193","widgetTemplateClass":"PI::Similarities::ViewTemplates::Carousel::Desktop","imageHeight":160,"linkGetParameters":"{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"e672bcd4-b03e-11e8-8dbb-41abd883f66d\",\"pf_rd_r\":\"X5Z293FJ403CC225M759\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"CrGGS\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"ktYgt\"}","faceoutTemplateClass":"PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout","auiDeviceType":"desktop","imageWidth":160,"schemaVersion":2,"productDetailsTemplateClass":"PI::P13N::ViewTemplates::ProductDetails::Desktop::Base","forceFreshWin":0,"productDataFlavor":"Faceout","relatedRequestID":"X5Z293FJ403CC225M759","maxLineCount":6},"id_list":["B07BHS22V6:","B00ITJNHX6:","B07DDGCLZ1:","B017XYQ4X2:","B01LYA8CLG:","B0747T62HS:","B00LHT0I78:","B071D5LL18:","B071NPLTRS:","B00CFMRFO0:","B01N4X1EL9:","B077R4WZ46:","B00YTZSTVY:","B073V5T8G2:","B00CFMRI7E:","B01ARIYIPM:","B0747X16FY:","B00ZWNPJVA:","B01N4WZ4AL:","B00BU662AU:","B07C2NYVMP:","B01FD7ZOB4:","B017M17VTC:","B00YTZST0K:","B07CVSJG6H:","B00V63GQBC:","B00NYBAJJY:","B01MCZ2ZQC:","B078BSJ8TV:","B077QXWJBR:","B07BL5FWVP:","B00N8SPSSU:","B01LXMVFGI:","B06ZY83D2Z:","B00ZQYY9TI:","B0761HT6JJ:","B06XRWB686:","B075XHDQ85:","B01LYJMK02:","B018JWYKRE:","B0759W61P6:","B078ZKNGRS:","B013BJBZBE:","B01LYMTVY2:","B072VMTVGZ:","B077QXW1Z9:","B07CMB96BX:","B07BNXNMZ5:","B01N3CY4Y3:","B018JX3J7U:","B0747T5MY1:","B07CQPTFDB:","B077QW292J:","B00LHT0GLQ:","B01C4B17XG:","B019WD74F4:"],"url":"/gp/p13n-shared/faceout-partial","id_param_name":"asins"},"baseAsin":"B01LS24R2U","name":"desktop-dp-sims_session-similarities","set_size":56}
Just use Python's json library:
import json
j1 = """{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}"""
# Decode the JSON text, then remove the ':' characters from every entry
# stored under ajax -> id_list.
d1 = json.loads(j1)
raw_ids = d1["ajax"]['id_list']
id_list = [raw_id.replace(":", "") for raw_id in raw_ids]
id_list  # bare expression: echoes the list in an interactive session
Output:
['B01M8QSY16',
'B017XBDBI6',
...
'B00R25QZHS']
I had to remove the "linkGetParameters": ... line because it does not seem to be valid JSON.
If you are sure that the attribute "id_list" will always be in one line in a similar single-space format after commas and colon, and the json module is not an option, then you can do the following:
# Regex-only extraction of the ids from the JSON text held in j1 (needs `import re`).
# The lookbehind matches the literal text '"id_list": [' (one space after the
# colon), so the property must be written exactly that way in j1.
# The expression evaluates to a list of id strings without quotes, colons or separators.
list( # make sure the result is a list
filter( # filter to…
None, # …remove any empty items
re.split( # split the line of id_list on…
r':(?:,\s)?', # …colon and then optional comma and spaces
re.search( # search…
r'(?<="id_list": \[)((?:"[^"]+:"(?:,\s*)?)+)', j1) # …for the id_list property and its value
.group(0) # take the match
.replace('"', '') # and drop all double quotes
)))
['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']
This is dense and mostly unreadable code; use it as-is, or I can break the logic down more readably if you want.
Seeing as you can't use the JSON library, you can try this here expression (tested on Python3):
# Locate the id_list array. Whitespace around ':' is optional so both the
# pretty-printed and the minified form of the JSON match, and the non-greedy
# group stops at the first ']' (the one closing the array).
id_list_match = re.search(r'"id_list"\s*:\s*\[(.*?)\]', jsonstr, re.DOTALL)
# Split the array body on commas, then strip surrounding whitespace, the
# quotes and the trailing ':' from each item; empty pieces are dropped.
result = [item.strip(' \t\r\n":') for item in id_list_match.group(1).split(",") if item.strip()]
(where jsonstr is a string containing all of the original JSON code).
To make it easier to understand, the above code uses
re.search (not re.filterall as you had suggested) to broadly locate and select the line,
group to narrow down the selection,
split to transform the string into a list, and
strip to trim off the unnecessary characters in each list item
leaving you with a list of IDs like the one you specify in your question.
First, as Florian H stated, you should request valid JSON from your source in order to be able to use the Python json module. Whoever provides JSON should provide valid JSON...
EDIT: The JSON seems valid, see below
Trying to use the json module anyway to address your need, I noted that the parsing problem comes from the escaped double-quote in linkGetParameters value.
I assume the JSON string has been copied/pasted as is, and this is probably the source of the JSON parsing problem: simply pasting this JSON into a regular Python string literal makes Python treat the backslash as an escape for the double quote instead of preserving the two characters.
To test the JSON content, you have to copy it into a raw string (= prefixed by an r):
import json
json_ = r"""{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}"""
# Parse the raw-string JSON, then drop the last character (the trailing ':')
# of every entry under ajax -> id_list.
result = json.loads(json_)
# print(...) with a single parenthesised argument works on Python 2 and 3 alike;
# the bare print statement is a SyntaxError on Python 3.
print([id_[:-1] for id_ in result['ajax']['id_list']])
# [u'B01M8QSY16', u'B017XBDBI6', u'B01GL5MYCE', u'B0751DHYXC', u'B01AHWOH54', u'B01M7XYENW', u'B01N7FKKXV', u'B07C1NLKS5', u'B00R25QZDC', u'B01AJB1VFW', u'B079K773M7', u'B07DX3W41P', u'B01GL5606A', u'B07654YLSB', u'B01GFL6MZE', u'B00WLI5E3M', u'B01CTE28DG', u'B01BELELVC', u'B00ZY7H91M', u'B077TPG2WK', u'B01G503MC6', u'B01LYZFC4V', u'B00ID9UQYK', u'B07C3T52LB', u'B07DX39RNS', u'B076551MZP', u'B0761RWKPQ', u'B00T8FD9YM', u'B07653JBYS', u'B07G316H74', u'B01FSEBC9K', u'B014QKBVH0', u'B01BVA2I4S', u'B01CVOZNAE', u'B07D19JDH9', u'B018ACDMJK', u'B00V0H83YW', u'B07C432PK3', u'B07B9P4T4V', u'B076H4WWLK', u'B077G3Y86F', u'B077Z7XLJF', u'B01NCFB2BB', u'B01M4I7FMC', u'B01BEVFJCM', u'B01FSEBC8G', u'B07DXCTKB6', u'B01NBHYAR0', u'B07DGWJ887', u'B00SLP58SU', u'B01N55H5AE', u'B013AZCPLS', u'B076PC3NYV', u'B01BVA2JHE', u'B07FF38J8C', u'B07DHGTS81', u'B00R25QZHS']
Once the id_list is retrieved, you can remove the last character of each id using string slicing.
When using JSON content from your original source instead of a literal string, you should not encounter this kind of escaping problem.
If using the json module is really not possible, then assuming an id is always 10 characters long, this should do the trick:
import re
json = """{
"ajax": {
"params": {
"asinMetadataKeys": "adId",
"featureId": "SimilaritiesCarousel",
"reftagPrefix": "pd_sbs_60",
"widgetTemplateClass": "PI::Similarities::ViewTemplates::Carousel::Desktop",
"imageHeight": 160,
"linkGetParameters": "{\"pf_rd_s\":\"desktop-dp-sims\",\"pf_rd_m\":\"A3JWKAKR8XB7XF\",\"pd_rd_r\":\"ac83cd73-b019-11e8-99c8-33d23753c678\",\"pf_rd_r\":\"H21WNBAW5EGZX90ND4PN\",\"pf_rd_t\":\"40701\",\"pd_rd_wg\":\"e6DPw\",\"pf_rd_p\":\"946762da-975a-438a-9e2b-a585cbe769b5\",\"pf_rd_i\":\"desktop-dp-sims\",\"pd_rd_w\":\"xg8TH\"}",
"faceoutTemplateClass": "PI::P13N::ViewTemplates::Product::Desktop::CarouselFaceout",
"auiDeviceType": "desktop",
"imageWidth": 160,
"schemaVersion": 2,
"productDetailsTemplateClass": "PI::P13N::ViewTemplates::ProductDetails::Desktop::Base",
"forceFreshWin": 0,
"productDataFlavor": "Faceout",
"relatedRequestID": "H21WNBAW5EGZX90ND4PN",
"maxLineCount": 6
},
"id_list": ["B01M8QSY16:", "B017XBDBI6:", "B01GL5MYCE:", "B0751DHYXC:", "B01AHWOH54:", "B01M7XYENW:", "B01N7FKKXV:", "B07C1NLKS5:", "B00R25QZDC:", "B01AJB1VFW:", "B079K773M7:", "B07DX3W41P:", "B01GL5606A:", "B07654YLSB:", "B01GFL6MZE:", "B00WLI5E3M:", "B01CTE28DG:", "B01BELELVC:", "B00ZY7H91M:", "B077TPG2WK:", "B01G503MC6:", "B01LYZFC4V:", "B00ID9UQYK:", "B07C3T52LB:", "B07DX39RNS:", "B076551MZP:", "B0761RWKPQ:", "B00T8FD9YM:", "B07653JBYS:", "B07G316H74:", "B01FSEBC9K:", "B014QKBVH0:", "B01BVA2I4S:", "B01CVOZNAE:", "B07D19JDH9:", "B018ACDMJK:", "B00V0H83YW:", "B07C432PK3:", "B07B9P4T4V:", "B076H4WWLK:", "B077G3Y86F:", "B077Z7XLJF:", "B01NCFB2BB:", "B01M4I7FMC:", "B01BEVFJCM:", "B01FSEBC8G:", "B07DXCTKB6:", "B01NBHYAR0:", "B07DGWJ887:", "B00SLP58SU:", "B01N55H5AE:", "B013AZCPLS:", "B076PC3NYV:", "B01BVA2JHE:", "B07FF38J8C:", "B07DHGTS81:", "B00R25QZHS:"],
"url": "/gp/p13n-shared/faceout-partial",
"id_param_name": "asins"
},
"baseAsin": "B01GL56060",
"name": "desktop-dp-sims_session-similarities",
"set_size": 57
}"""
# https://regex101.com/r/qxYe9N/11
# Capture every double-quoted run of exactly 10 uppercase letters/digits that
# is immediately followed by ':' and the closing quote.
id_re = re.compile(r'"([A-Z0-9]{10}):"')
# `json` here is the string variable assigned above, not the json module.
result = id_re.findall(json)
# print(...) with a single parenthesised argument works on Python 2 and 3 alike;
# the bare print statement is a SyntaxError on Python 3.
print(result)
# ['B01M8QSY16', 'B017XBDBI6', 'B01GL5MYCE', 'B0751DHYXC', 'B01AHWOH54', 'B01M7XYENW', 'B01N7FKKXV', 'B07C1NLKS5', 'B00R25QZDC', 'B01AJB1VFW', 'B079K773M7', 'B07DX3W41P', 'B01GL5606A', 'B07654YLSB', 'B01GFL6MZE', 'B00WLI5E3M', 'B01CTE28DG', 'B01BELELVC', 'B00ZY7H91M', 'B077TPG2WK', 'B01G503MC6', 'B01LYZFC4V', 'B00ID9UQYK', 'B07C3T52LB', 'B07DX39RNS', 'B076551MZP', 'B0761RWKPQ', 'B00T8FD9YM', 'B07653JBYS', 'B07G316H74', 'B01FSEBC9K', 'B014QKBVH0', 'B01BVA2I4S', 'B01CVOZNAE', 'B07D19JDH9', 'B018ACDMJK', 'B00V0H83YW', 'B07C432PK3', 'B07B9P4T4V', 'B076H4WWLK', 'B077G3Y86F', 'B077Z7XLJF', 'B01NCFB2BB', 'B01M4I7FMC', 'B01BEVFJCM', 'B01FSEBC8G', 'B07DXCTKB6', 'B01NBHYAR0', 'B07DGWJ887', 'B00SLP58SU', 'B01N55H5AE', 'B013AZCPLS', 'B076PC3NYV', 'B01BVA2JHE', 'B07FF38J8C', 'B07DHGTS81', 'B00R25QZHS']