I'm trying to edit some dictionary values using a regular expression and then turn the result back into a dict object.
I am checking IP data with the Shodan API, which returns far more results than I need.
Shodan IP data:
ipinfo {'city': None, 'region_code': None, 'os': None, 'tags': ['vpn'], 'ip': 771247238, 'isp': 'Host Universal Pty', 'area_code': None, 'dma_code': None, 'last_update': '2019-05-01T06:53:53.130508', 'country_code3': 'AUS', 'country_name': 'Australia', 'hostnames': [], 'postal_code': None, 'longitude': 143.2104, 'country_code': 'AU', 'ip_str': '45.248.76.134', 'latitude': -33.494, 'org': 'Host Universal Pty', 'data': [{'_shodan': {'id': 'bc2dc252-5b9d-4b3d-975f-0156860c8849', 'options': {}, 'ptr': True, 'module': 'https', 'crawler': '65e79faecee26516a8ed6f16c1142432f303fbdc'}, 'hash': 0, 'os': None, 'opts': {}, 'ip': 771247238, 'isp': 'Host Universal Pty', 'port': 443, 'hostnames': [], 'location': {'city': None, 'region_code': None, 'area_code': None, 'longitude': 143.2104, 'country_code3': 'AUS', 'country_name': 'Australia', 'postal_code': None, 'dma_code': None, 'country_code': 'AU', 'latitude': -33.494}, 'timestamp': '2019-05-01T06:53:53.130508', 'domains': [], 'org': 'Host Universal Pty', 'data': '', 'asn': 'AS136557', 'transport': 'tcp', 'ip_str': '45.248.76.134'}, {'_shodan': {'id': 'cdce36e7-588f-4377-8cc6-f9bedd426e6b', 'options': {}, 'ptr': True, 'module': 'https', 'crawler': '0636e1e6dd371760aeaf808ed839236e73a9e74d'}, 'hash': 0, 'os': None, 'opts': {}, 'ip': 771247238, 'isp': 'Host Universal Pty', 'port': 8443, 'hostnames': [], 'location': {'city': None, 'region_code': None, 'area_code': None, 'longitude': 143.2104, 'country_code3': 'AUS', 'country_name': 'Australia', 'postal_code': None, 'dma_code': None, 'country_code': 'AU', 'latitude': -33.494}, 'timestamp': '2019-04-26T18:31:18.138759', 'domains': [], 'org': 'Host Universal Pty', 'data': '', 'asn': 'AS136557', 'transport': 'tcp', 'ip_str': '45.248.76.134'}, {'_shodan': {'id': '27e5f5e0-662e-4621-b043-56d64d25f38d', 'options': {}, 'ptr': True, 'module': 'http', 'crawler': 'c9b639b99e5410a46f656e1508a68f1e6e5d6f99'}, 'hash': 0, 'os': None, 'opts': {}, 'ip': 771247238, 'isp': 'Host Universal Pty', 'http': {'robots_hash': None, 'redirects': [], 'securitytxt': None, 'title': None, 'sitemap_hash': None, 'robots': None, 'server': None, 'host': '45.248.76.134', 'html': None, 'location': '/', 'html_hash': None, 'sitemap': None, 'securitytxt_hash': None}, 'port': 8080, 'hostnames': [], 'location': {'city': None, 'region_code': None, 'area_code': None, 'longitude': 143.2104, 'country_code3': 'AUS', 'country_name': 'Australia', 'postal_code': None, 'dma_code': None, 'country_code': 'AU', 'latitude': -33.494}, 'timestamp': '2019-04-21T03:00:14.986062', 'domains': [], 'org': 'Host Universal Pty', 'data': '', 'asn': 'AS136557', 'transport': 'tcp', 'ip_str': '45.248.76.134'}, {'_shodan': {'id': 'bfbc3556-d00d-4512-8cb3-32ef6cae9964', 'options': {}, 'ptr': True, 'module': 'ike', 'crawler': '8cd926590a400feb4b683f8337a77287ddf3d2c7'}, 'hash': -451677272, 'os': None, 'tags': ['vpn'], 'opts': {'raw': '61713862726c6c3764627037343033792920252800000000000000240000000800000005'}, 'ip': 771247238, 'isp': 'Host Universal Pty', 'port': 500, 'isakmp': {'initiator_spi': '61713862726c6c37', 'responder_spi': '6462703734303379', 'msg_id': '00000000', 'next_payload': 41, 'exchange_type': 37, 'length': 36, 'version': '2.0', 'flags': {'encryption': False, 'authentication': False, 'commit': False}, 'aggressive': {'initiator_spi': 'a6517b6a97dca862', 'responder_spi': '1655d8123c9f2104', 'msg_id': 'd14144c6', 'next_payload': 11, 'exchange_type': 5, 'length': 40, 'version': '1.0', 'flags': {'encryption': False, 'authentication': False, 'commit': False}, 
'vendor_ids': []}, 'vendor_ids': []}, 'hostnames': [], 'location': {'city': None, 'region_code': None, 'area_code': None, 'longitude': 143.2104, 'country_code3': 'AUS', 'country_name': 'Australia', 'postal_code': None, 'dma_code': None, 'country_code': 'AU', 'latitude': -33.494}, 'timestamp': '2019-04-13T11:18:42.166709', 'domains': [], 'org': 'Host Universal Pty', 'data': 'VPN (IKE)\n\nInitiator SPI: 61713862726c6c37\nResponder SPI: 6462703734303379\nNext Payload: RESERVED\nVersion: 2.0\nExchange Type: DOI Specific Use\nFlags:\n Encryption: False\n Commit: False\n Authentication: False\nMessage ID: 00000000\nLength: 36', 'asn': 'AS136557', 'transport': 'udp', 'ip_str': '45.248.76.134'}], 'asn': 'AS136557', 'ports': [443, 8443, 8080, 500]}
I use a regular expression to delete the data I don't need; this removes everything from the 'data' field onwards.
osint_ip1 = re.sub("..'data':.*", "}", str(ipinfo))
Here's the problem... Since Shodan returns inconsistent fields depending on the IP, I need to use a DictWriter to write the corresponding values to their fields.
The problem with this is that I have to cast ipinfo to a string to edit the data, and the string object cannot be used with the CSV DictWriter.
How do I turn the string back into dict format?
OSINT (string):
rejoin : ["{'city': None", " 'region_code': None", " 'os': None", " 'tags': ['vpn']", " 'ip': 771247238", " 'isp': 'Host Universal Pty'", " 'area_code': None", " 'dma_code': None", " 'last_update': '2019-05-01T06:53:53.130508'", " 'country_code3': 'AUS'", " 'country_name': 'Australia'", " 'hostnames': []", " 'postal_code': None", " 'longitude': 143.2104", " 'country_code': 'AU'", " 'ip_str': '45.248.76.134'", " 'latitude': -33.494", " 'org': 'Host Universal Pty'}"]
Full code below
import csv
import os
import re
import time
import shodan
from shodan import Shodan
def OPSINT():
    for x in ip:
        print(x)
        try:
            ipinfo = api.host(x)
        except shodan.exception.APIError:
            ipinfo = None
            pass
        filename = 'C:\\ProgramData\\FDA\\output\\processed\\OSINT.csv'
        if ipinfo != None:
            osint_ip1 = re.sub("..'data':.*", "}", str(ipinfo))
            osint_ip = osint_ip1.split(',')
            print("rejoin :", osint_ip)
            # print(osint_ip)
            print("ipinfo", ipinfo)
            with open("C:\\ProgramData\\FDA\\output\\processed\\OSINT.csv", 'a') as csvfile:
                fieldnames = ['city', 'region_code', 'os', 'tags', 'ip', 'isp', 'area_code', 'dma_code', 'last_update',
                              'country_code3', 'country_name', 'hostnames', 'postal_code', 'longitude', 'country_code',
                              'ip_str', 'latitude', 'org']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                fileEmpty = os.stat(filename).st_size == 0
                if fileEmpty:
                    writer.writeheader()
                else:
                    writer.writerows(osint_ip)
                csvfile.close()
A dict is a well-structured data type meant to be accessed and/or manipulated via keys. Manipulating a dict by converting it to a string and performing regex substitution is wholly unnecessary and error-prone.
Since you've already defined the exact keys you want as fieldnames, you can simply use operator.itemgetter to get the values of these keys from the dict ipinfo, and zip them with the key names to construct a new dict for csv.DictWriter.writerow to write from:
from operator import itemgetter
...
writer.writerow(dict(zip(fieldnames, itemgetter(*fieldnames)(ipinfo))))
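One caveat: itemgetter raises KeyError when a field is missing, and as the question notes, Shodan's results vary by IP. A minimal sketch of a more forgiving version using dict.get; the header-once logic here is an assumption about the intended CSV behavior, not part of the original answer:

row = {key: ipinfo.get(key) for key in fieldnames}  # missing keys become None instead of raising

fileEmpty = os.stat(filename).st_size == 0
if fileEmpty:
    writer.writeheader()  # write the header once, when the file is new
writer.writerow(row)      # then always append the data row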
Related
I have the following code. I am trying to access https://api.github.com/users/jtorre94 via the requests library.
import requests
api_url = "https://api.github.com/users"
response = requests.get(api_url, params={'login': 'jtorre94'})
response.json()
However, the response is something I do not recognize at all, as if it had not been filtered by the jtorre94 parameter.
[{'login': 'mojombo',
'id': 1,
'node_id': 'MDQ6VXNlcjE=',
'avatar_url': 'https://avatars.githubusercontent.com/u/1?v=4',
'gravatar_id': '',
'url': 'https://api.github.com/users/mojombo',
'html_url': 'https://github.com/mojombo',
'followers_url': 'https://api.github.com/users/mojombo/followers',
'following_url': 'https://api.github.com/users/mojombo/following{/other_user}',
'gists_url': 'https://api.github.com/users/mojombo/gists{/gist_id}',
'starred_url': 'https://api.github.com/users/mojombo/starred{/owner}{/repo}',
'subscriptions_url': 'https://api.github.com/users/mojombo/subscriptions',
'organizations_url': 'https://api.github.com/users/mojombo/orgs',
'repos_url': 'https://api.github.com/users/mojombo/repos',
'events_url': 'https://api.github.com/users/mojombo/events{/privacy}',
'received_events_url': 'https://api.github.com/users/mojombo/received_events',
'type': 'User',
'site_admin': False},
{'login': 'defunkt',
'id': 2,
'node_id': 'MDQ6VXNlcjI=',
'avatar_url': 'https://avatars.githubusercontent.com/u/2?v=4',
'gravatar_id': '',
'url': 'https://api.github.com/users/defunkt',
'html_url': 'https://github.com/defunkt',
'followers_url': 'https://api.github.com/users/defunkt/followers',
'following_url': 'https://api.github.com/users/defunkt/following{/...
How can I retrieve the json for username jtorre94?
Append the username to the URL path, as you already tried in your browser. The bare /users endpoint lists all users and ignores a login query parameter, which is why your params argument had no effect:
import requests
user = 'jtorre94'
api_url = f"https://api.github.com/users/{user}"
response = requests.get(api_url)
response.json()
Output:
{'login': 'jtorre94',
'id': 76944588,
'node_id': 'MDQ6VXNlcjc2OTQ0NTg4',
'avatar_url': 'https://avatars.githubusercontent.com/u/76944588?v=4',
'gravatar_id': '',
'url': 'https://api.github.com/users/jtorre94',
'html_url': 'https://github.com/jtorre94',
'followers_url': 'https://api.github.com/users/jtorre94/followers',
'following_url': 'https://api.github.com/users/jtorre94/following{/other_user}',
'gists_url': 'https://api.github.com/users/jtorre94/gists{/gist_id}',
'starred_url': 'https://api.github.com/users/jtorre94/starred{/owner}{/repo}',
'subscriptions_url': 'https://api.github.com/users/jtorre94/subscriptions',
'organizations_url': 'https://api.github.com/users/jtorre94/orgs',
'repos_url': 'https://api.github.com/users/jtorre94/repos',
'events_url': 'https://api.github.com/users/jtorre94/events{/privacy}',
'received_events_url': 'https://api.github.com/users/jtorre94/received_events',
'type': 'User',
'site_admin': False,
'name': None,
'company': None,
'blog': '',
'location': None,
'email': None,
'hireable': None,
'bio': None,
'twitter_username': None,
'public_repos': 4,
'public_gists': 0,
'followers': 0,
'following': 0,
'created_at': '2021-01-04T10:11:25Z',
'updated_at': '2022-07-23T11:17:18Z'}
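A hedged usage note: GitHub returns HTTP 404 for a login that doesn't exist, so it can be worth checking the status before parsing. The handling below is illustrative, not part of the original answer:

import requests

user = 'jtorre94'
response = requests.get(f"https://api.github.com/users/{user}")
if response.ok:  # False on 404 (unknown user) and other error statuses
    profile = response.json()
else:
    print(f"GitHub returned {response.status_code} for {user!r}")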
I am trying to pick 'Instances' out of JSON data which looks like this:
[{'Groups': [], 'Instances': [{'AmiLaunchIndex': 0, 'ImageId': 'ami-0ceecbb0f30a902a6', 'InstanceId': 'i-xxxxx', 'InstanceType': 't2.micro', 'KeyName': 'xxxx', 'LaunchTime': {'$date': '2022-12-17T13:07:54Z'}, 'Monitoring': {'State': 'disabled'}, 'Placement': {'AvailabilityZone': 'us-west-2b', 'GroupName': '', 'Tenancy': 'default'}, 'PrivateDnsName': 'ip-zxxxxx.us-west-2.compute.internal', 'PrivateIpAddress': 'xxxxx', 'ProductCodes': [], 'PublicDnsName': 'ec2-xx-xxx-xxx.us-west-2.compute.amazonaws.com', 'PublicIpAddress': 'xxxxxx', 'State': {'Code': 16, 'Name': 'running'}, 'StateTransitionReason': '', 'SubnetId': 'subnet-xxxxx', 'VpcId': 'vpc-xxxxx', 'Architecture': 'x86_64', 'BlockDeviceMappings': [{'DeviceName': '/dev/xvda', 'Ebs': {'AttachTime': {'$date': '2022-12-17T13:07:55Z'}, 'DeleteOnTermination': True, 'Status': 'attached', 'VolumeId': 'vol-xxxx'}}], 'ClientToken': '529fc1ac-bf64-4804-b0b8-7c7778ace68c', 'EbsOptimized': False, 'EnaSupport': True, 'Hypervisor': 'xen', 'NetworkInterfaces': [{'Association': {'IpOwnerId': 'amazon', 'PublicDnsName': 'ec2-35-86-111-31.us-west-2.compute.amazonaws.com', 'PublicIp': 'xxxxx'}, 'Attachment': {'AttachTime': {'$date': '2022-12-17T13:07:54Z'}, 'AttachmentId': 'eni-attach-0cac7d4af20664b23', 'DeleteOnTermination': True, 'DeviceIndex': 0, 'Status': 'attached', 'NetworkCardIndex': 0}, 'Description': '', 'Groups': [{'GroupName': 'launch-wizard-5', 'GroupId': 'sg-xxxxx'}], 'Ipv6Addresses': [], 'MacAddress': 'xxxxx', 'NetworkInterfaceId': 'eni-xxxxx', 'OwnerId': 'xxxx', 'PrivateDnsName': 'ip-xxxxx.us-west-2.compute.internal', 'PrivateIpAddress': 'xxx.xxx.xxx', 'PrivateIpAddresses': [{'Association': {'IpOwnerId': 'amazon', 'PublicDnsName': 'ec2-xx-xx-xx-xxx.us-west-2.compute.amazonaws.com', 'PublicIp': 'xxx.xxx.xxx'}, 'Primary': True, 'PrivateDnsName': 'ip-172-31-20-187.us-west-2.compute.internal', 'PrivateIpAddress': 'xxx.xxx.xxx'}], 'SourceDestCheck': True, 'Status': 'in-use', 'SubnetId': 'subnet-xxxxxxx', 'VpcId': 'vpc-0b09cd4sedxxx', 'InterfaceType': 'interface'}], 'RootDeviceName': '/dev/xvda', 'RootDeviceType': 'ebs', 'SecurityGroups': [{'GroupName': 'launch-wizard-5', 'GroupId': 'sg-0a0d1c79d8076660e'}], 'SourceDestCheck': True, 'Tags': [{'Key': 'Name', 'Value': 'MainServers'}], 'VirtualizationType': 'hvm', 'CpuOptions': {'CoreCount': 1, 'ThreadsPerCore': 1}, 'CapacityReservationSpecification': {'CapacityReservationPreference': 'open'}, 'HibernationOptions': {'Configured': False}, 'MetadataOptions': {'State': 'applied', 'HttpTokens': 'optional', 'HttpPutResponseHopLimit': 1, 'HttpEndpoint': 'enabled', 'HttpProtocolIpv6': 'disabled', 'InstanceMetadataTags': 'disabled'}, 'EnclaveOptions': {'Enabled': False}, 'PlatformDetails': 'Linux/UNIX', 'UsageOperation': 'RunInstances', 'UsageOperationUpdateTime': {'$date': '2022-12-17T13:07:54Z'}, 'PrivateDnsNameOptions': {'HostnameType': 'ip-name', 'EnableResourceNameDnsARecord': True, 'EnableResourceNameDnsAAAARecord': False}, 'MaintenanceOptions': {'AutoRecovery': 'default'}}], 'OwnerId': '76979cfxdsss11', 'ReservationId': 'r-xxxxx'}]
I tried loading the data and doing
resp = json.loads(jsonfile)
reqData= resp['Instances']
But I am getting this error:
TypeError: list indices must be integers or slices, not str
Is there any way I can fix this and get the data? Help will be extremely appreciated.
It's wrapped inside a list, so index into the list first:
print(resp[0]['Instances'])
To select only the instances from the data, you can use the json.loads function to parse the JSON, then index into the top-level list before extracting the Instances field.
import json

# Parse the JSON data (the top-level value is a list of reservations)
data = json.loads(json_data)

# Extract the instances from the first reservation
instances = data[0]['Instances']
You can then iterate over the data with something like this:
for instance in instances:
    instance_id = instance['InstanceId']
    instance_type = instance['InstanceType']
    launch_time = instance['LaunchTime']
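Since the top-level value is a list of reservations, each with its own Instances list, here is a short sketch that walks all of them (variable names are illustrative):

import json

data = json.loads(json_data)
for reservation in data:  # one entry per reservation
    for instance in reservation['Instances']:
        print(instance['InstanceId'], instance['InstanceType'])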
I want to pull a report which is over 2000 rows from Salesforce via the API using Python. How do I update the POST request to send the updated metadata with the new filters in order to get the next 2000 rows of data? Here is the code I have, but the response of the POST request has the exact same filters as before. What am I doing wrong here?
Excerpt of Code:
headers = {
    'Content-type': 'application/json',
    'Accept-Encoding': 'gzip',
    'Authorization': 'Bearer %s' % access_token
}
parameters = {}
descripion = requests.request('get', instance_url + '/services/data/v51.0/analytics/reports/00O4Q000009VEPCUA4/describe',
                              headers=headers, params=parameters, timeout=30).json()
orig_metadata = descripion['reportMetadata']

id_column = 'CUST_NAME'
last_load_num = '162451'
sf_id_column = descripion['reportExtendedMetadata']['detailColumnInfo'][id_column]['label']
print(sf_id_column)

metadata = {
    'reportBooleanFilter': '({}) AND {}'.format(orig_metadata['reportBooleanFilter'],
                                                len(orig_metadata['reportFilters']) + 1),
    'reportFilters': orig_metadata['reportFilters'] + [{'column': id_column,
                                                        'filterType': 'fieldValue',
                                                        'isRunPageEditable': True,
                                                        'operator': 'greaterThan',
                                                        'value': last_load_num}],
    'standardDateFilter': [{'column': 'CUST_CREATED_DATE', 'durationValue': 'CUSTOM',
                            'endDate': '2021-07-14', 'startDate': '2021-07-01'}],
    'detailColumns': orig_metadata['detailColumns'][:],
    'sortBy': [{'sortColumn': id_column, 'sortOrder': 'Asc'}],
}

r = requests.request('post', instance_url + '/services/data/v51.0/analytics/reports/00O4Q000009VEPCUA4',
                     headers=headers, params={'metadata': metadata}, timeout=30).json()
Here is what's in the original metadata:
{'aggregates': ['s!rtms__Load__c.rtms__Carrier_Quote_Total__c', 'RowCount'],
'chart': None,
'crossFilters': [],
'currency': None,
'dashboardSetting': None,
'description': None,
'detailColumns': ['CUST_NAME',
'CUST_CREATED_DATE',
'rtms__Load__c.rtms__Expected_Ship_Date2__c',
'rtms__Load__c.rtms__Load_Status__c',
'rtms__Load__c.rtms__Total_Weight__c',
'rtms__Load__c.rtms__Equipment_Type__c',
'rtms__Load__c.rtms__Origin__c',
'rtms__Load__c.rtms__Destination__c',
'rtms__Load__c.rtms__Zip3_Lane__c',
'rtms__Load__c.rtms__Zip5_Lane__c',
'rtms__Load__c.rtms__Carrier_Quote_Total__c',
'rtms__Load__c.rtms__Customer_Quote_Total__c'],
'developerName': 'Adel_Past_Shipment_Test_Pricing_Tool',
'division': None,
'folderId': '00l1U000000eXWwQAM',
'groupingsAcross': [],
'groupingsDown': [],
'hasDetailRows': True,
'hasRecordCount': True,
'historicalSnapshotDates': [],
'id': '00O4Q000009VEPCUA4',
'name': 'Adel Past Shipment Test Pricing Tool',
'presentationOptions': {'hasStackedSummaries': True},
'reportBooleanFilter': None,
'reportFilters': [{'column': 'rtms__Load__c.rtms__Customer__c',
'filterType': 'fieldValue',
'isRunPageEditable': True,
'operator': 'contains',
'value': 'adel'},
{'column': 'rtms__Load__c.rtms__Load_Status__c',
'filterType': 'fieldValue',
'isRunPageEditable': True,
'operator': 'notContain',
'value': 'cancelled'}],
'reportFormat': 'TABULAR',
'reportType': {'label': 'Loads', 'type': 'CustomEntity$rtms__Load__c'},
'scope': 'organization',
'showGrandTotal': True,
'showSubtotals': True,
'sortBy': [{'sortColumn': 'CUST_CREATED_DATE', 'sortOrder': 'Desc'}],
'standardDateFilter': {'column': 'CUST_CREATED_DATE',
'durationValue': 'CUSTOM',
'endDate': None,
'startDate': None},
'standardFilters': None,
'supportsRoleHierarchy': False,
'userOrHierarchyFilterId': None}
And here is what's in r['reportMetadata']:
{'aggregates': ['s!rtms__Load__c.rtms__Carrier_Quote_Total__c', 'RowCount'],
'chart': None,
'crossFilters': [],
'currency': None,
'dashboardSetting': None,
'description': None,
'detailColumns': ['CUST_NAME',
'CUST_CREATED_DATE',
'rtms__Load__c.rtms__Expected_Ship_Date2__c',
'rtms__Load__c.rtms__Load_Status__c',
'rtms__Load__c.rtms__Total_Weight__c',
'rtms__Load__c.rtms__Equipment_Type__c',
'rtms__Load__c.rtms__Origin__c',
'rtms__Load__c.rtms__Destination__c',
'rtms__Load__c.rtms__Zip3_Lane__c',
'rtms__Load__c.rtms__Zip5_Lane__c',
'rtms__Load__c.rtms__Carrier_Quote_Total__c',
'rtms__Load__c.rtms__Customer_Quote_Total__c'],
'developerName': 'Adel_Past_Shipment_Test_Pricing_Tool',
'division': None,
'folderId': '00l1U000000eXWwQAM',
'groupingsAcross': [],
'groupingsDown': [],
'hasDetailRows': True,
'hasRecordCount': True,
'historicalSnapshotDates': [],
'id': '00O4Q000009VEPCUA4',
'name': 'Adel Past Shipment Test Pricing Tool',
'presentationOptions': {'hasStackedSummaries': True},
'reportBooleanFilter': None,
'reportFilters': [{'column': 'rtms__Load__c.rtms__Customer__c',
'filterType': 'fieldValue',
'isRunPageEditable': True,
'operator': 'contains',
'value': 'adel'},
{'column': 'rtms__Load__c.rtms__Load_Status__c',
'filterType': 'fieldValue',
'isRunPageEditable': True,
'operator': 'notContain',
'value': 'cancelled'}],
'reportFormat': 'TABULAR',
'reportType': {'label': 'Loads', 'type': 'CustomEntity$rtms__Load__c'},
'scope': 'organization',
'showGrandTotal': True,
'showSubtotals': True,
'sortBy': [{'sortColumn': 'CUST_CREATED_DATE', 'sortOrder': 'Desc'}],
'standardDateFilter': {'column': 'CUST_CREATED_DATE',
'durationValue': 'CUSTOM',
'endDate': None,
'startDate': None},
'standardFilters': None,
'supportsRoleHierarchy': False,
'userOrHierarchyFilterId': None}
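One likely culprit, offered as a hedged sketch rather than a confirmed fix: requests serializes params into the URL query string, while the Reports and Dashboards REST API expects the modified metadata as a JSON request body wrapped in a reportMetadata key. Merging the describe output with the overrides, that POST might look like:

# Send the metadata in the body (json=...), not the query string (params=...);
# merging orig_metadata with the overrides is an assumption about what the API needs.
r = requests.post(
    instance_url + '/services/data/v51.0/analytics/reports/00O4Q000009VEPCUA4',
    headers=headers,
    json={'reportMetadata': {**orig_metadata, **metadata}},
    timeout=30,
).json()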
I am currently using glom to parse through a JSON API response, which returns, among other things, a list of dictionaries, with a list of dictionaries inside it. The problem I'm having is getting glom to access the correct dictionary entry.
Example JSON:
{'answeredAt': '2019-08-23T21:11:04Z',
'direction': 'Inbound',
'disposition': 'Answered',
'duration': 110867,
'endedAt': '2019-08-23T21:12:55Z',
'from': {'connectedAt': '2019-08-23T21:11:04Z',
'departmentName': None,
'deviceType': None,
'disconnectedAt': '2019-08-23T21:12:55Z',
'name': 'blah',
'number': '1234567890',
'number_e164': '1234567890',
'serviceId': None,
'userId': None},
'initialQueueName': 'blah',
'joinedLinkedIds': [],
'legs': [{'departmentName': 'default',
'deviceType': 'Unknown',
'legType': 'Dial',
'menuName': None,
'menuOption': None,
'menuPrompt': None,
'number': '1234567890',
'optionAction': None,
'optionArg': None,
'queueName': None,
'serviceId': 327727,
'timestamp': '2019-08-23T21:11:04Z',
'userId': None},
{'departmentName': 'default',
'deviceType': 'Unknown',
'legType': 'Answer',
'menuName': None,
'menuOption': None,
'menuPrompt': None,
'number': '1234567890',
'optionAction': None,
'optionArg': None,
'queueName': None,
'serviceId': 327727,
'timestamp': '2019-08-23T21:11:04Z',
'userId': None},
{'departmentName': None,
'deviceType': None,
'legType': 'EnterIVR',
'menuName': 'blah',
'menuOption': None,
'menuPrompt': None,
'number': None,
'optionAction': None,
'optionArg': None,
'queueName': None,
'serviceId': None,
'timestamp': '2019-08-23T21:11:05Z',
'userId': None},
{'departmentName': None,
'deviceType': None,
'legType': 'IVRSchedule',
'menuName': 'Day',
'menuOption': None,
'menuPrompt': None,
'number': None,
'optionAction': None,
'optionArg': None,
'queueName': None,
'serviceId': None,
'timestamp': '2019-08-23T21:11:06Z',
'userId': None},
{'departmentName': None,
'deviceType': None,
'legType': 'EnterQueue',
'menuName': None,
'menuOption': None,
'menuPrompt': None,
'number': None,
'optionAction': None,
'optionArg': None,
'queueName': 'blah',
'serviceId': None,
'timestamp': '2019-08-23T21:11:15Z',
'userId': None},
{'departmentName': None,
'deviceType': None,
'legType': 'Hangup',
'menuName': None,
'menuOption': None,
'menuPrompt': None,
'number': 'blah',
'optionAction': None,
'optionArg': None,
'queueName': None,
'serviceId': None,
'timestamp': '2019-08-23T21:12:55Z',
'userId': None}],
'linkedId': 'some unique key',
'startedAt': '2019-08-23T21:11:04Z',
'to': {'connectedAt': '2019-08-23T21:11:04Z',
'departmentName': 'default',
'deviceType': 'Unknown',
'disconnectedAt': '2019-08-23T21:12:55Z',
'name': None,
'number': '1234567890',
'number_e164': '1234567890',
'serviceId': 327727,
'userId': None},
'version': {'label': None, 'major': 4, 'minor': 2, 'point': 1}},
The information I'm trying to get at is in 'legs', where 'legType' == 'Dial' or 'EnterIVR'. I need 'number' from the 'Dial' leg, and 'menuName' from the 'EnterIVR' leg. I can get it, for instance, to list back all the different legTypes, but not the data specifically from those.
This is where I'm at currently:
with open('callstest.csv', mode='w') as calls:
    data_writer = csv.writer(calls, delimiter=',')
    data_writer.writerow(['LinkedID', 'Number', 'Queue', 'Client'])
    target = response_json['calls']
    glomtemp = {}
    for item in target:
        spec = {
            'Linked ID': 'linkedId',
            # this returns the number I need only in certain cases,
            # so I need 'number' from the 'Dial' legType
            'Number': ('to', 'number'),
            'Queue': 'initialQueueName',
            'Client': ...,  # need help here, should be 'menuName' from
                            # the 'EnterIVR' legType
        }
        glomtemp = glom(item, spec)
        # print(glomtemp)
        data_writer.writerow([glomtemp['Linked ID'], glomtemp['Number'], glomtemp['Queue']])
Right now I can get them to fall back with Coalesce to "None", but that's not what I'm looking for.
Any suggestions on how I should spec this to get the info out of those 2 legs for 'Number' and 'Client'?
If I understand correctly, you want to filter out certain entries that don't fit the supported legType. You're definitely onto something with the Coalesce, and I think the key here is glom's Check specifier type, combined with the SKIP singleton. I had to tweak your current spec a bit to match the example data, but this runs:
from pprint import pprint

from glom import glom, Check, Coalesce, SKIP

LEG_SPEC = {'Client': Coalesce('menuName', default=''),
            'Number': Coalesce('to.number', default=''),
            'Linked ID': 'serviceId',
            'Queue': 'queueName'}

entries_spec = ('legs',
                [Check('legType', one_of=('Dial', 'EnterIVR'), default=SKIP)],
                [LEG_SPEC])

pprint(glom(target, entries_spec))
# prints:
# [{'Client': None, 'Linked ID': 327727, 'Number': '', 'Queue': None},
# {'Client': 'blah', 'Linked ID': None, 'Number': '', 'Queue': None}]
Not sure if that was exactly what you were hoping to see, but the pattern is there. I think you want Nones (or '') for those other fields because the csv you're writing is going to want to put something in those columns.
There are other ways of doing filtered iteration using glom, too. The snippets page has a short section, complete with examples.
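If the Check/SKIP pattern feels heavy for just two values, here is a plain-Python sketch (not glom) that pulls 'number' from the Dial leg and 'menuName' from the EnterIVR leg, falling back to None when a leg is absent; the helper name is made up for illustration:

def pick_leg(call, leg_type, field):
    # Return `field` from the first leg whose legType matches, else None.
    return next((leg[field] for leg in call['legs']
                 if leg['legType'] == leg_type), None)

number = pick_leg(item, 'Dial', 'number')        # for the 'Number' column
client = pick_leg(item, 'EnterIVR', 'menuName')  # for the 'Client' column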
{'contributors': None,
'coordinates': None,
'created_at': 'Tue Aug 02 19:51:58 +0000 2016',
'entities': {'hashtags': [],
'symbols': [],
'urls': [],
'user_mentions': [{'id': 873491544,
'id_str': '873491544',
'indices': [0, 13],
'name': 'Kenel M',
'screen_name': 'KxSweaters13'}]},
'favorite_count': 1,
'favorited': False,
'geo': None,
'id': 760563814450491392,
'id_str': '760563814450491392',
'in_reply_to_screen_name': 'KxSweaters13',
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': 873491544,
'in_reply_to_user_id_str': '873491544',
'is_quote_status': False,
'lang': 'en',
'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
'place': {'attributes': {},
'bounding_box': {'coordinates': [[[-71.813501, 42.4762],
[-71.702186, 42.4762],
[-71.702186, 42.573956],
[-71.813501, 42.573956]]],
'type': 'Polygon'},
'contained_within': [],
'country': 'Australia',
'country_code': 'AUS',
'full_name': 'Melbourne, V',
'id': 'c4f1830ea4b8caaf',
'name': 'Melbourne',
'place_type': 'city',
'url': 'https://api.twitter.com/1.1/geo/id/c4f1830ea4b8caaf.json'},
'retweet_count': 0,
'retweeted': False,
'source': 'Twitter for Android',
'text': '#KxSweaters13 are you the kenelx13 I see owning leominster for team valor?',
'truncated': False,
'user': {'contributors_enabled': False,
'created_at': 'Thu Apr 21 17:09:52 +0000 2011',
'default_profile': False,
'default_profile_image': False,
'description': "Arbys when it's cold. Kimballs when it's warm. #Ally__09 all year. Comp sci classes sometimes.",
'entities': {'description': {'urls': []}},
'favourites_count': 1106,
'follow_request_sent': None,
'followers_count': 167,
'following': None,
'friends_count': 171,
'geo_enabled': True,
'has_extended_profile': False,
'id': 285715182,
'id_str': '285715182',
'is_translation_enabled': False,
'is_translator': False,
'lang': 'en',
'listed_count': 2,
'location': 'MA',
'name': 'Steve',
'notifications': None,
'profile_background_color': '131516',
'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme14/bg.gif',
'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme14/bg.gif',
'profile_background_tile': True,
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/285715182/1462218226',
'profile_image_url': 'http://pbs.twimg.com/profile_images/727223698332200961/bGPjGjHK_normal.jpg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/727223698332200961/bGPjGjHK_normal.jpg',
'profile_link_color': '4A913C',
'profile_sidebar_border_color': 'FFFFFF',
'profile_sidebar_fill_color': 'EFEFEF',
'profile_text_color': '333333',
'profile_use_background_image': True,
'protected': False,
'screen_name': 'StephenBurke_',
'statuses_count': 5913,
'time_zone': 'Eastern Time (US & Canada)',
'url': None,
'utc_offset': -14400,
'verified': False}}
I have a JSON file which contains a list of JSON objects (each has the structure shown above).
So I read it into a dataframe:
df = pd.read_json('data.json')
and then I try to get all the rows which are the 'city' type by:
df = df[df['place']['place_type'] == 'city']
but then I got TypeError: an integer is required, and during handling of that exception, another exception occurred: KeyError: 'place_type'.
Then I tried:
df['place'].head(3)
=>
0 {'id': '01864a8a64df9dc4', 'url': 'https://api...
1 {'id': '01864a8a64df9dc4', 'url': 'https://api...
2 {'id': '0118c71c0ed41109', 'url': 'https://api...
Name: place, dtype: object
So df['place'] returns a Series where the keys are the indexes, and that's why I got the TypeError.
I've also tried to select the place_type of the first row and it works just fine:
df.iloc[0]['place']['place_type']
=>
city
The question is how can I filter out the rows in this case?
Solution:
Okay, so the problem lies in the fact that pd.read_json cannot deal with the nested JSON structure, so what I have done is normalize the JSON objects:
with open('data.json') as jsonfile:
    data = json.load(jsonfile)

df = pd.io.json.json_normalize(data)
df = df[df['place.place_type'] == 'city']
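A small usage note: in pandas 1.0 and later, json_normalize is exposed at the top level, so the same normalization can be written as follows (behavior unchanged; the file name follows the question):

import json
import pandas as pd

with open('data.json') as jsonfile:
    data = json.load(jsonfile)

df = pd.json_normalize(data)               # pd.io.json.json_normalize in older pandas
df = df[df['place.place_type'] == 'city']  # nested keys become dotted column names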
You can use a list comprehension to do the filtering you need:
df = [loc for loc in df if loc['place']['place_type'] == 'city']
This will give you a list of the elements whose place_type is 'city'.
I don't know if you have to use the nested place_type key to show all the rows that contain 'city'.
"and then I try to get all the rows which are the city type by:"
This way you can get all the rows that contain 'city' in the place column:
df = df[(df['place'] == 'city')]