Related
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 6 days ago.
Improve this question
I'm using an API from Anomali to gather an intel list, and I want to ask how I could run the code so that it outputs all the needed column headers into an Excel file.
So i created a code where i pull out the needed columns to be implemented to the site.
import requests
import json
import pandas as pd
import csv

url = 'https://api.threatstream.com/api/v2/intelligence/?itype=bot_ip'

# Desired CSV column headers.
# NOTE(review): the API payload shown below uses 'is_public' (with an
# underscore) and also contains a 'longitude' key; 'ispublic' here never
# matches and 'longitude' is absent.  extrasaction='ignore' below keeps
# DictWriter from raising on the unexpected keys — confirm the intended
# column names against the live API.
csv_columns = ['ip', 'source_created', 'status', 'itype', 'expiration_ts', 'is_editable', 'feed_id', 'update_id',
               'value', 'ispublic', 'threat_type', 'workgroups', 'rdns', 'confidence', 'uuid', 'retina_confidence',
               'trusted_circle_ids', 'id', 'source', 'owner_organization_id', 'import_session_id', 'source_modified',
               'type', 'sort', 'description', 'tags', 'threatscore', 'latitude', 'modified_ts', 'org', 'asn',
               'created_ts', 'tlp', 'is_anonymous', 'country', 'source_reported_confidence', 'can_add_public_tags',
               'subtype', 'meta', 'resource_uri']

headers = {
    'Accept': 'application/json',
    'Authorization': 'apikey testing:wdwfawaf12321rfewawafa'
}
response = requests.get(url=url, headers=headers)
json_Data = json.loads(response.content)
result = json_Data["objects"]  # a *list* of dicts, one per intelligence record

# Open the file once, write the header, then one row per record.
# DictWriter.writerow expects a single dict; the original code passed the
# whole list, which raised "'list' object has no attribute 'keys'".
with open("AnomaliThreat.csv", "a", newline='') as filecsv:
    writer = csv.DictWriter(filecsv, fieldnames=csv_columns, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(result)
If I run this code, all I get is the error "'list' object has no attribute 'keys'". My guess is that this is because inside the response there's a list inside the list, or another string inside the list — for example like this:
'trusted_circle_ids': [1241412, 212141241]
or this
'tags': [{'id': 'fwafwff', 'name': 'wfwafwawf'},
{'id': '31231ewfw',
'name': 'fwafwafwafaw#gmail.com.wafawfawfds.com'}],
And this is what's inside the response of anomali
[{'source_created': None,
'status': 'inactive',
'itype': 'bot_ip',
'expiration_ts': '',
'ip': '231.24124.1241.412',
'is_editable': False,
'feed_id': 23112231,
'update_id': 231231,
'value': '124124124141224141',
'is_public': False,
'threat_type': 'bot',
'workgroups': [],
'rdns': None,
'confidence': 12,
'uuid': '3123414124124142',
'retina_confidence': 52414,
'trusted_circle_ids': [1241412, 212141241],
'id': fwaffewaewafw1231231,
'source': 'wfawfwaefwadfwa',
'owner_organization_id': 2,
'import_session_id': None,
'source_modified': None,
'type': 'ip',
'sort': [312312424124141241, '1241414214241'],
'description': None,
'tags': [{'id': 'fwafwff', 'name': 'wfwafwawf'},
{'id': '31231ewfw',
'name': 'fwafwafwafaw#gmail.com.wafawfawfds.com'}],
'threatscore': 412,
'latitude': wafefwaf,
'modified_ts': 'wawafwadfd',
'org': 'fawfwafawe',
'asn': 'fwafwa2131231',
'created_ts': '41241241241241',
'tlp': None,
'is_anonymous': False,
'country': 'fwafw',
'source_reported_confidence': 21,
'can_add_public_tags': False,
'longitude': --321412,
'subtype': None,
'meta': {'detail2': 'bi2141412412342424',
'severity': '3123124r3'},
'resource_uri': '/api/v2/intelligence/241fsdfsf241325/'},
{'source_created': None,
'status': 'inactive',
'itype': 'bot_ip',
'expiration_ts': '',
'ip': '231.24124.1241.412',
'is_editable': False,
'feed_id': 23112231,
'update_id': 231231,
'value': '124124124141224141',
'is_public': False,
'threat_type': 'bot',
'workgroups': [],
'rdns': None,
'confidence': 12,
'uuid': '3123414124124142',
'retina_confidence': 52414,
'trusted_circle_ids': [1241412, 212141241],
'id': fwaffewaewafw1231231,
'source': 'wfawfwaefwadfwa',
'owner_organization_id': 2,
'import_session_id': None,
'source_modified': None,
'type': 'ip',
'sort': [312312424124141241, '1241414214241'],
'description': None,
'tags': [{'id': 'fwafwff', 'name': 'wfwafwawf'},
{'id': '31231ewfw',
'name': 'fwafwafwafaw#gmail.com.wafawfawfds.com'}],
'threatscore': 412,
'latitude': wafefwaf,
'modified_ts': 'wawafwadfd',
'org': 'fawfwafawe',
'asn': 'fwafwa2131231',
'created_ts': '41241241241241',
'tlp': None,
'is_anonymous': False,
'country': 'fwafw',
'source_reported_confidence': 21,
'can_add_public_tags': False,
'longitude': --321412,
'subtype': None,
'meta': {'detail2': 'bi2141412412342424',
'severity': '3123124r3'},
'resource_uri': '/api/v2/intelligence/241fsdfsf241325/'}]
I'm open to any suggestions on how to make it so that the results can be inputed into an excel file
Problem Solved!
I needed to add a value to the code, so i added this line
# Write the header once (taken from the first record's keys), then one row
# of values per record, and close the file when done.
csv_writer = csv.writer(data_file)
for row_index, record in enumerate(result):
    if row_index == 0:
        csv_writer.writerow(record.keys())
    csv_writer.writerow(record.values())
data_file.close()
You can try doing something like this if i understood correctly,
import requests
import json
import pandas as pd
import csv

url = 'https://api.threatstream.com/api/v2/intelligence/?itype=bot_ip'
csv_columns = ['ip', 'source_created', 'status', 'itype', 'expiration_ts', 'is_editable', 'feed_id', 'update_id',
               'value', 'ispublic', 'threat_type', 'workgroups', 'rdns', 'confidence', 'uuid', 'retina_confidence',
               'trusted_circle_ids', 'id', 'source', 'owner_organization_id', 'import_session_id', 'source_modified',
               'type', 'sort', 'description', 'tags', 'threatscore', 'latitude', 'modified_ts', 'org', 'asn',
               'created_ts', 'tlp', 'is_anonymous', 'country', 'source_reported_confidence', 'can_add_public_tags',
               'subtype', 'meta', 'resource_uri']
headers = {
    'Accept': 'application/json',
    'Authorization': 'apikey testing:wdwfawaf12321rfewawafa'
}
response = requests.get(url=url, headers=headers)
json_Data = json.loads(response.content)
result = json_Data["objects"]  # a *list* of record dicts

# Build the DataFrame directly from the list of records.  The original code
# had two fatal bugs: `pd.Dataframe` (the class is `pd.DataFrame`, and it
# was never even called) and `result.items()` (result is a list, which has
# no .items()).  columns= keeps only the wanted fields, in that order; a
# field missing from a record becomes NaN.
# NOTE(review): 'ispublic' does not match the payload's 'is_public' key, so
# that column will be all NaN — confirm the intended name.
dataframe_1 = pd.DataFrame(result, columns=csv_columns)
dataframe_1.to_csv("AnomaliThreat.csv")
Something along those lines: basically, iterate through the key–value pairs in the result, check whether the key is in csv_columns, and save that key–value pair; finally, once all that is done, just use dataframe.to_csv.
I want to be able to GET information from API 1 and match it with API 2 and be able to update API 2's information with API 1. I am trying to figure out the most efficient/automated way to accomplish this as it also needs to be updated at a interval of every 10 minutes
I can query and get the results from API 1 this is my code and what my code looks like.
import json
import requests

myToken = '52c32f6588004cb3ab33b0ff320b8e4f'
myUrl = 'https://api1.com/api/v1/devices.json'

# Token-based auth header for API 1.
head = {'Authorization': f'Token {myToken}'}
response = requests.get(myUrl, headers=head)
# Decode the JSON body into a Python dict.
r = json.loads(response.content)
r
The payload looks like this from API 1
{ "device" : {
"id": 153,
"battery_status" : 61,
"serial_no": "5QBYGKUI05",
"location_lat": "-45.948917",
"location_lng": "29.832179",
"location_address": "800 Laurel Rd, Lansdale, PA 192522,USA"}
}
I want to be able to take this information and match by "serial_no" and update all the other pieces of information for the corresponding device in API 2
I query the data for API 2 and this is what my code looks like
# Query API 2 for assets at one location, added on or after 11/23/2020.
params = {
    "location": 'cf6707e3-f0ae-4040-a184-737b21a4bbd1',
    "dateAdded": 'ge:11/23/2020',
}
url = requests.get('https://api2.com/api/assets', auth=('api2', '123456'), params=params)
# Decode the response body and look at the asset list.
r = json.loads(url.content)
r['items']
The JSON payload looks like this
[{'id': '064ca857-3783-460e-a7a2-245e054dcbe3',
'name': 'Apple Laptop 1',
'model': {'id': '50f5993e-2abf-49c8-86e0-8743dd58db6f',
'name': 'MacBook Pro'},
'manufacturer': {'id': 'f56244e2-76e3-46da-97dd-f72f92ca0779',
'name': 'APPLE'},
'room': {'id': '700ff2dc-0118-46c6-936a-01f0fa88c620',
'name': 'Storage Room 1',
'thirdPartyId': ''},
'location': {'id': 'cf6707e3-f0ae-4040-a184-737b21a4bbd1',
'name': 'Iron Mountain',
'thirdPartyId': ''},
'position': 'NonMounted',
'containerAsset': {'id': '00000000-0000-0000-0000-000000000000',
'name': None},
'baseAsset': {'id': '064ca857-3783-460e-a7a2-245e054dcbe3',
'name': 'Apple Laptop 1'},
'description': None,
'status': {'id': 'df9906d8-2856-45e3-9cba-bd7a1ac4971f',
'name': 'Production'},
'serialNumber': '5QBYGKUI06',
'tagNumber': None,
'alternateTagNumber': None,
'verificationStatus': {'id': 'cb3560a9-eef5-47b9-b033-394d3a09db18',
'name': 'Verified'},
'requiresRFID': False,
'requiresHangTag': False,
'bottomPosition': 0.0,
'leftPosition': 0.0,
'rackPosition': 'Front',
'labelX': None,
'labelY': None,
'verifyNameInRear': False,
'verifySerialNumberInRear': False,
'verifyBarcodeInRear': False,
'isNonDataCenter': False,
'rotate': False,
'customer': {'id': '00000000-0000-0000-0000-000000000000', 'name': None},
'thirdPartyId': '',
'temperature': None,
'dateLastScanned': None,
'placement': 'Floor',
'lastScannedLabelX': None,
'lastScannedLabelY': None,
'userDefinedValues': [{'userDefinedKeyId': '79e77a1e-4030-4308-a8ff-9caf40c04fbd',
'userDefinedKeyName': 'Longitude ',
'value': '-75.208917'},
{'userDefinedKeyId': '72c8056e-9b7d-40ac-9270-9f5929097e82',
'userDefinedKeyName': 'Address',
'value': '800 Laurel Rd, New York ,NY 19050, USA'},
{'userDefinedKeyId': '31aeeb91-daef-4364-8dd6-b0e3436d6a51',
'userDefinedKeyName': 'Battery Level',
'value': '67'},
{'userDefinedKeyId': '22b7ce4f-7d3d-4282-9ecb-e8ec2238acf2',
'userDefinedKeyName': 'Latitude',
'value': '35.932179'}]}
The documentation provided by API 2 tells me they only support PUT for updates as of right now but I would also want to know how I would do this using PATCH as it will be available in the future. So the data payload that I need to successful PUT is this
payload = {'id': '064ca857-3783-460e-a7a2-245e054dcbe3',
'name': 'Apple Laptop 1',
'model': {'id': '50f5993e-2abf-49c8-86e0-8743dd58db6f',
'name': 'MacBook Pro'},
'manufacturer': {'id': 'f56244e2-76e3-46da-97dd-f72f92ca0779',
'name': 'APPLE'},
'room': {'id': '700ff2dc-0118-46c6-936a-01f0fa88c620',
'name': 'Storage Room 1',
'thirdPartyId': ''},
'status': {'id': 'df9906d8-2856-45e3-9cba-bd7a1ac4971f',
'name': 'Production'},
'serialNumber': '5QBYGKUI06',
'verificationStatus': {'id': 'cb3560a9-eef5-47b9-b033-394d3a09db18',
'name': 'Verified'},
'requiresRFID': 'False',
'requiresHangTag': 'False',
'userDefinedValues': [{'userDefinedKeyId': '79e77a1e-4030-4308-a8ff-9caf40c04fbd',
'userDefinedKeyName': 'Longitude ',
'value': '-75.248920'},
{'userDefinedKeyId': '72c8056e-9b7d-40ac-9270-9f5929097e82',
'userDefinedKeyName': 'Address',
'value': '801 Laurel Rd, New York, Ny 192250, USA'},
{'userDefinedKeyId': '31aeeb91-daef-4364-8dd6-b0e3436d6a51',
'userDefinedKeyName': 'Battery Level',
'value': '67'},
{'userDefinedKeyId': '22b7ce4f-7d3d-4282-9ecb-e8ec2238acf2',
'userDefinedKeyName': 'Latitude',
'value': '29.782177'}]}
So apart of this is figuring out how I can query the json data portions that I need for the update
I am able to update the information using this line
requests.put('https://api2.com/api/assets/064ca857-3783-460e-a7a2-245e054dcbe3',auth=('API2', '123456'), data=json.dumps(payload))
but I need for it to dynamically update so I don't think the hard coded id parameter in the line will be efficient in a automation/efficiency standpoint. If anybody has any ideas, resources to point me in the right direction to know more about this process (I don't really know what it is even called) would be greatly appreciated.
Not entirely sure what you are trying to do here, but if you want to pull information nested in the responses you can do this.
Serial number from API 1
r['device']['serial_no']
Serial number for API 2
either r[0]['serialNumber'] or r['items'][0]['serialNumber'] depending on what you are showing
To modify the payload serial number, for example
payload['serialNumber'] = '123456abcdef'
This is my first time dealing with json data. So I'm not that familiar with the structure of json.
I got some data through "we the people" e-petition sites with following code:
url = "https://api.whitehouse.gov/v1/petitions.json?limit=3&offset=0&createdBefore=1573862400"
# Fetch the petitions feed and decode the JSON body in two explicit steps.
response = requests.get(url)
jdata_2 = response.json()
Yet I realize this is something different from the ordinary JSON structure, since I got an error when I tried to convert it into an Excel file with pandas:
df = pandas.read_json(jdata_2)
Obviously, I must miss something which I must have done before using pandas.read_json() code.
I have searched for the answer but most of questions are "How can I convert json data into excel data", which needs json data. For my case, I scraped it from the url, so I thought I could make that strings into json data, and then try to convert it into excel data as well. So I tried to use json.dump() as well, but it didn't work as well.
I know it must be the naive question. But I'm not sure where I can start with this naive question. If anyone can instruct me how to deal with it, I would really appreciate it. Or link me some references that I can study as well.
Thank you for your help in advance.
This is the json data with the requests, and I pprint it with indent=4.
Input:
url = "https://api.whitehouse.gov/v1/petitions.json?limit=3&offset=0&createdBefore=1573862400"
# pprint.pprint(obj, indent=4) is equivalent to PrettyPrinter(indent=4).pprint(obj).
pprint.pprint(jdata_2, indent=4)
Output :
{ 'metadata': { 'requestInfo': { 'apiVersion': 1,
'query': { 'body': None,
'createdAfter': None,
'createdAt': None,
'createdBefore': '1573862400',
'isPublic': 1,
'isSignable': None,
'limit': '3',
'mock': 0,
'offset': '0',
'petitionsDefaultLimit': '1000',
'publicThreshold': 149,
'responseId': None,
'signatureCount': None,
'signatureCountCeiling': None,
'signatureCountFloor': 0,
'signatureThreshold': None,
'signatureThresholdCeiling': None,
'signatureThresholdFloor': None,
'sortBy': 'DATE_REACHED_PUBLIC',
'sortOrder': 'ASC',
'status': None,
'title': None,
'url': None,
'websiteUrl': 'https://petitions.whitehouse.gov'},
'resource': 'petitions'},
'responseInfo': { 'developerMessage': 'OK',
'errorCode': '',
'moreInfo': '',
'status': 200,
'userMessage': ''},
'resultset': {'count': 1852, 'limit': 3, 'offset': 0}},
'results': [ { 'body': 'Please save kurdish people in syria \r\n'
'pleaee save north syria',
'created': 1570630389,
'deadline': 1573225989,
'id': '2798897',
'isPublic': True,
'isSignable': False,
'issues': [ { 'id': 326,
'name': 'Homeland Security & '
'Defense'}],
'petition_type': [ { 'id': 291,
'name': 'Call on Congress to '
'act on an issue'}],
'reachedPublic': 0,
'response': [],
'signatureCount': 149,
'signatureThreshold': 100000,
'signaturesNeeded': 99851,
'status': 'closed',
'title': 'Please save rojava north syria\r\n'
'please save kurdish people\r\n'
'please stop erdogan\r\n'
'plaease please',
'type': 'petition',
'url': 'https://petitions.whitehouse.gov/petition/please-save-rojava-north-syria-please-save-kurdish-people-please-stop-erdogan-plaease-please'},
{ 'body': 'Kane Friess was a 2 year old boy who was '
"murdered by his mom's boyfriend, Gyasi "
'Campbell. Even with expert statements from '
'forensic anthropologists, stating his injuries '
'wete the result of homicide. Mr. Campbell was '
'found guilty of involuntary manslaughter. This '
"is an outrage to Kane's Family and our "
'community.',
'created': 1566053365,
'deadline': 1568645365,
'id': '2782248',
'isPublic': True,
'isSignable': False,
'issues': [ { 'id': 321,
'name': 'Criminal Justice Reform'}],
'petition_type': [ { 'id': 281,
'name': 'Change an existing '
'Administration '
'policy'}],
'reachedPublic': 0,
'response': [],
'signatureCount': 149,
'signatureThreshold': 100000,
'signaturesNeeded': 99851,
'status': 'closed',
'title': "Kane's Law. Upon which the murder of a child, "
'regardless of circumstances, be seen as 1st '
'degree murder. A Federal Law.',
'type': 'petition',
'url': 'https://petitions.whitehouse.gov/petition/kanes-law-upon-which-murder-child-regardless-circumstances-be-seen-1st-degree-murder-federal-law'},
{ 'body': "Schumer and Pelosi's hatred and refusing to "
'work with President Donald J. Trump is holding '
'America hostage. We the people know securing '
'our southern border is a priority which will '
'not happen with these two in office. Lets '
'build the wall NOW!',
'created': 1547050064,
'deadline': 1549642064,
'id': '2722358',
'isPublic': True,
'isSignable': False,
'issues': [ {'id': 306, 'name': 'Budget & Taxes'},
{ 'id': 326,
'name': 'Homeland Security & '
'Defense'},
{'id': 29, 'name': 'Immigration'}],
'petition_type': [ { 'id': 291,
'name': 'Call on Congress to '
'act on an issue'}],
'reachedPublic': 0,
'response': [],
'signatureCount': 149,
'signatureThreshold': 100000,
'signaturesNeeded': 99851,
'status': 'closed',
'title': 'Remove Chuck Schumer and Nancy Pelosi from '
'office',
'type': 'petition',
'url': 'https://petitions.whitehouse.gov/petition/remove-chuck-schumer-and-nancy-pelosi-office'}]}
And this is the Error message I got
Input :
df = pandas.read_json(jdata_2)
Output :
ValueError: Invalid file path or buffer object type: <class 'dict'>
You can try the below code as well, it is working fine
URL = "https://api.whitehouse.gov/v1/petitions.json?limit=3&offset=0&createdBefore=1573862400"
# Fetch the JSON response from the URL.
# (The original used C-style "//" comments, which are a SyntaxError in Python.)
req = requests.get(URL)
text_data = req.text
json_dict = json.loads(text_data)
# Convert the "results" list of the JSON dictionary to a pandas DataFrame.
df = pd.DataFrame.from_dict(json_dict["results"])
# Finally, save the DataFrame in Excel format, i.e. .xlsx
# (to_excel needs an Excel writer backend such as openpyxl installed).
df.to_excel("output.xlsx")
I found myself not understanding how I can select only some elements of my Steam API request response.
Here is the code with the results that makes a correct request on Steam. It is non-reproducable because client_id is personal information. The results are included.
# All online streamers
client_id = "...confidential"
limit = "2"


def request_dataNewAPI(limit):
    """Return the decoded JSON payload for the first `limit` live streams
    from the Twitch Helix streams endpoint."""
    request_headers = {"Client-ID": client_id, "Accept": "application/vnd.twitchtv.v5+json"}
    endpoint = "https://api.twitch.tv/helix/streams?first=" + limit
    return requests.get(endpoint, headers=request_headers).json()


# For a bad user login name, or an offline streamer, the response will be:
# {'data': [], 'pagination': {}}
table1 = request_dataNewAPI(limit)
The output is:
New API
{'data': [{'id': '34472839600', 'user_id': '12826', 'user_name': 'Twitch', 'game_id': '509663', 'community_ids': ['f261cf73-cbcc-4b08-af72-c6d2020f9ed4'], 'type': 'live', 'title': 'The 1st Ever 3rd or 4th Pre Pre Show! Part 6', 'viewer_count': 19555, 'started_at': '2019-06-10T02:01:20Z', 'language': 'en', 'thumbnail_url': 'https://static-cdn.jtvnw.net/previews-ttv/live_user_twitch-{width}x{height}.jpg', 'tag_ids': ['d27da25e-1ee2-4207-bb11-dd8d54fa29ec', '6ea6bca4-4712-4ab9-a906-e3336a9d8039']}, {'id': '34474693232', 'user_id': '39298218', 'user_name': 'dakotaz', 'game_id': '33214', 'community_ids': [], 'type': 'live', 'title': '𝙖𝙙𝙫𝙚𝙣𝙩𝙪𝙧𝙚 𝙩𝙞𝙢𝙚 | code: dakotaz in itemshop & GFUEL', 'viewer_count': 15300, 'started_at': '2019-06-10T06:37:02Z', 'language': 'en', 'thumbnail_url': 'https://static-cdn.jtvnw.net/previews-ttv/live_user_dakotaz-{width}x{height}.jpg', 'tag_ids': ['6ea6bca4-4712-4ab9-a906-e3336a9d8039']}], 'pagination': {'cursor': 'eyJiIjpudWxsLCJhIjp7Ik9mZnNldCI6Mn19'}}
The problem is that I want to select only the list of 'user_name' of active streamers. I tried the following:
print(table1['data']['user_name'])
gives "TypeError: list indices must be integers or slices, not str".
print(table1['data'])
gives the whole array of data:
[{'id': '34472839600', 'user_id': '12826', 'user_name': 'Twitch', 'game_id': '509663', 'community_ids': ['f261cf73-cbcc-4b08-af72-c6d2020f9ed4'], 'type': 'live', 'title': 'The 1st Ever 3rd or 4th Pre Pre Show! Part 6', 'viewer_count': 19555, 'started_at': '2019-06-10T02:01:20Z', 'language': 'en', 'thumbnail_url': 'https://static-cdn.jtvnw.net/previews-ttv/live_user_twitch-{width}x{height}.jpg', 'tag_ids': ['d27da25e-1ee2-4207-bb11-dd8d54fa29ec', '6ea6bca4-4712-4ab9-a906-e3336a9d8039']}, {'id': '34474693232', 'user_id': '39298218', 'user_name': 'dakotaz', 'game_id': '33214', 'community_ids': [], 'type': 'live', 'title': '𝙖𝙙𝙫𝙚𝙣𝙩𝙪𝙧𝙚 𝙩𝙞𝙢𝙚 | code: dakotaz in itemshop & GFUEL', 'viewer_count': 15300, 'started_at': '2019-06-10T06:37:02Z', 'language': 'en', 'thumbnail_url': 'https://static-cdn.jtvnw.net/previews-ttv/live_user_dakotaz-{width}x{height}.jpg', 'tag_ids': ['6ea6bca4-4712-4ab9-a906-e3336a9d8039']}]
As a final result, I would like to have something like:
'user_name': {name1, name2}
The problem is that I want to select only the list of 'user_name' of active streamers
... print(table1['data']['user_name'])
... gives "TypeError: list indices must be integers or slices, not str".
You receive TypeError because table1['data'] is a list, not dict and you must access its members with int, not str (although dict keys can be int as well).
Use list comprehension:
user_names = [x['user_name'] for x in table1['data']]
This will give you the list of strings representing user names.
Here is the script:
from bs4 import BeautifulSoup as bs4
import requests
import json
from lxml import html
from pprint import pprint
import re
def get_data():
    """Fetch the Bovada MLB game-lines page and return the embedded
    `swc_market_lists` object as a JSON string."""
    url = 'https://sports.bovada.lv//baseball/mlb/game-lines-market-group'
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"})
    html_bytes = r.text
    soup = bs4(html_bytes, 'lxml')
    # res = soup.findAll('script') # find all scripts..
    # Locate the <script> tag whose text assigns swc_market_lists.
    # (The regex is only used to *find* the tag; the non-greedy {.*?}
    # group would stop at the first '}' and so is not used to extract.)
    pattern = re.compile(r"swc_market_lists\s+=\s+(\{.*?\})")
    script = soup.find("script", text=pattern)
    # Slice off the leading "var swc_market_lists = " assignment prefix
    # (23 characters) to leave just the JSON literal.
    # NOTE(review): fragile — confirm the prefix length against the live page.
    return script.text[23:]


test1 = get_data()
data = json.loads(test1)  # top-level payload: display groups with nested game lines

# Print the two starting pitchers for every game line.
for item1 in data['items']:
    data1 = item1['itemList']['items']
    for item2 in data1:
        pitch_a = item2['opponentAName']
        pitch_b = item2['opponentBName']
        ## group = item2['displayGroups']
        ## for item3 in group:
        ## new_il = item3['itemList']
        ## for item4 in new_il:
        ## market = item4['description']
        ## oc = item4['outcomes']
        print(pitch_a, pitch_b)

##for items in data['items']:
## pos = items['itemList']['items']
## for item in pos:
## work = item['competitors']
## pitcher_a = item['opponentAName']
## pitcher_b = item['opponentBName']
## group = item['displayGroups']
## for item, item2 in zip(work,group):
## team = item['abbreviation']
## place = item['type']
## il2 = item2['itemList']
## for item in il2:
## ml = item['description']
## print(team,place,pitcher_a,pitcher_b,ml)
I have been trying to scrape
team abbrev = ['items']['itemList']['items']['competitors']['abbreviation']
home_away = ['items']['itemList']['items']['competitors']['type']
team pitcher home = ['items']['itemList']['items']['opponentAName']
team pitcher away = ['items']['itemList']['items']['opponentBName']
moneyline american odds = ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['american']
Total runs = ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['handicap']
Part of the Json pprinted:
[{'baseLink': '/baseball/mlb/game-lines-market-group',
'defaultType': True,
'description': 'Game Lines',
'id': '136',
'itemList': {'items': [{'LIVE': True,
'atmosphereLink': '/api/atmosphere/eventNotification/events/A/3149961',
'awayTeamFirst': True,
'baseLink': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
'competitionId': '24736',
'competitors': [{'abbreviation': 'LAA',
'description': 'Los Angeles Angels',
'id': '3149961-1642',
'rotationNumber': '978',
'shortName': 'Angels',
'type': 'HOME'},
{'abbreviation': 'MIN',
'description': 'Minnesota Twins',
'id': '3149961-9990',
'rotationNumber': '977',
'shortName': 'Twins',
'type': 'AWAY'}],
'denySameGame': 'NO',
'description': 'Minnesota Twins # Los Angeles Angels',
'displayGroups': [{'baseLink': '/baseball/mlb/game-lines-market-group',
'defaultType': True,
'description': 'Game Lines',
'id': '136',
'itemList': [{'belongsToDefault': True,
'columns': 'H2Columns',
'description': 'Moneyline',
'displayGroups': '136,A-136',
'id': '46892277',
'isInRunning': True,
'mainMarketType': 'MONEYLINE',
'mainPeriod': True,
'marketTypeGroup': 'MONEY_LINE',
'notes': '',
'outcomes': [{'competitorId': '3149961-9990',
'description': 'Minnesota '
'Twins',
'id': '211933276',
'price': {'american': '-475',
'decimal': '1.210526',
'fractional': '4/19',
'id': '1033002124',
'outcomeId': '211933276'},
'status': 'OPEN',
'type': 'A'},
{'competitorId': '3149961-1642',
'description': 'Los '
'Angeles '
'Angels',
'id': '211933277',
'price': {'american': '+310',
'decimal': '4.100',
'fractional': '31/10',
'id': '1033005679',
'outcomeId': '211933277'},
'status': 'OPEN',
'type': 'H'}],
'periodType': 'Live '
'Match',
'sequence': '14',
'sportCode': 'BASE',
'status': 'OPEN',
'type': 'WW'},
{'belongsToDefault': True,
'columns': 'H2Columns',
'description': 'Runline',
'displayGroups': '136,A-136',
'id': '46892287',
'isInRunning': True,
'mainMarketType': 'SPREAD',
'mainPeriod': True,
'marketTypeGroup': 'SPREAD',
'notes': '',
'outcomes': [{'competitorId': '3149961-9990',
'description': 'Minnesota '
'Twins',
'id': '211933278',
'price': {'american': '+800',
'decimal': '9.00',
'fractional': '8/1',
'handicap': '-1.5',
'id': '1033005677',
'outcomeId': '211933278'},
'status': 'OPEN',
'type': 'A'},
{'competitorId': '3149961-1642',
'description': 'Los '
'Angeles '
'Angels',
'id': '211933279',
'price': {'american': '-2000',
'decimal': '1.050',
'fractional': '1/20',
'handicap': '1.5',
'id': '1033005678',
'outcomeId': '211933279'},
'status': 'OPEN',
'type': 'H'}],
'periodType': 'Live '
'Match',
'sequence': '14',
'sportCode': 'BASE',
'status': 'OPEN',
'type': 'SPR'}],
'link': '/baseball/mlb/game-lines-market-group'}],
'feedCode': '13625145',
'id': '3149961',
'link': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
'notes': '',
'numMarkets': 2,
'opponentAId': '214704',
'opponentAName': 'Tyler Skaggs (L)',
'opponentBId': '215550',
'opponentBName': 'Lance Lynn (R)',
'sport': 'BASE',
'startTime': 1526090820000,
'status': 'O',
'type': 'MLB'},
There are a few different loops I had started in the script above but either of them are working out the way I would like.
away team | away moneyline | away pitcher | Total Runs | and repeat for Home Team is what I would like it to be eventually. I can write to csv once it is parsed the proper way.
Thank you for the fresh set of eyes, I've been working on this for the better part of a day trying to figure out the best way to access the content I would like. If Json is not the best way and bs4 works better I would love to hear your opinion
There's no simple answer to your problem. Scraping data requires you to carefully assess the data you are dealing with, work out where the parts you want to extract are located and figure out how to effectively store the data you extract.
Try printing the data in your loops to visualise what is happening in your code (or try debugging). From there its easy to figure out it if you're iterating over what you expect. Look for patterns throughout the input data to help organise the data you extract.
To help yourself, you should give your variables descriptive names, separate your code into logical chunks and add comments when it starts to get complicated.
Here's some working code, but I encourage you to try what I told you above, then if you're still stuck look below for guidance.
# Map each game-line description to per-side (HOME/AWAY) team details.
output = {}
game_lines = data['items'][0]['itemList']['items']
for line_info in game_lines:
    # Per-side data for this game line, keyed 'HOME' / 'AWAY'.
    sides = {}

    # Record each competitor's abbreviation and full name under its side.
    for competitor in line_info['competitors']:
        sides[competitor['type']] = {
            'abbreviation': competitor['abbreviation'],
            'name': competitor['description'],
        }

    # Pull the moneyline price and run-line handicap for each side.
    for market in line_info['displayGroups'][0]['itemList']:
        market_kind = market['mainMarketType']
        for outcome in market['outcomes']:
            side = 'HOME' if outcome['type'] == 'H' else 'AWAY'
            if market_kind == 'MONEYLINE':
                sides[side]['moneyline'] = outcome['price']['american']
            elif market_kind == 'SPREAD':
                sides[side]['total runs'] = outcome['price']['handicap']

    # Attach the starting pitchers.
    sides['HOME']['pitcher'] = line_info['opponentAName']
    sides['AWAY']['pitcher'] = line_info['opponentBName']

    # Store everything gathered for this game line.
    output[line_info['description']] = sides
This produces like:
{
'Atlanta Braves # Miami Marlins': {
'AWAY': {
'abbreviation': 'ATL',
'moneyline': '-130',
'name': 'Atlanta Braves',
'pitcher': 'Mike Soroka (R)',
'total runs': '-1.5'
},
'HOME': {
'abbreviation': 'MIA',
'moneyline': '+110',
'name': 'Miami Marlins',
'pitcher': 'Jarlin Garcia (L)',
'total runs': '1.5'
}
},
'Boston Red Sox # Toronto Blue Jays': {
'AWAY': {
'abbreviation': 'BOS',
'moneyline': '-133',
'name': 'Boston Red Sox',
'pitcher': 'David Price (L)',
'total runs': '-1.5'
},
'HOME': {
'abbreviation': 'TOR',
'moneyline': '+113',
'name': 'Toronto Blue Jays',
'pitcher': 'Marco Estrada (R)',
'total runs': '1.5'
}
},
}