Scraping a particular dictionary format to a dataframe in python

Scraping a particular dictionary format to a dataframe in python - python

I have a dictionary structure which looks something like this:
data = [{'organization': {'id': 14, 'description': 'France'},
'distribution': True,
'forAttention': True},
{'organization': {'id': 38, 'description': 'Netherlands'},
'distribution': True,
'forAttention': True},
{'organization': {'id': 31, 'description': 'Poland'},
'notifying': True,
'origin': True},
{'organization': {'id': 17, 'description': 'United Kingdom'},
'distribution': True}]
(Please note this is just 1 example with 4 organizations, but this number can vary)
I want to put this information for the organizations in a single row of a dataframe something like this:
>> df
Origin_ct Notifying_ct Distribution_ct ForAttention_ct
Poland Poland France, Netherlands, France, Netherlands
United Kingdom
The idea is for every organization - check if it has a True value and put it in the respective column for the dataframe. How do I do this?

Try this:
import pandas as pd

# One list per flag; each collects the country names whose record has that
# flag set to a truthy value.
origin_ct = []
notifying_ct = []
distribution_ct = []
forattention_ct = []

# Pair every flag key found in the records with the list it feeds.
flag_targets = (
    ('origin', origin_ct),
    ('notifying', notifying_ct),
    ('distribution', distribution_ct),
    ('forAttention', forattention_ct),
)

for organization in data:
    country = organization['organization']['description']
    for flag, countries in flag_targets:
        # .get() treats a missing flag exactly like a false one.
        if organization.get(flag):
            countries.append(country)

# Build the single-row result: each cell holds the comma-separated countries
# collected for that flag (an empty string when no country matched).
df = pd.DataFrame(
    [{
        'Origin_ct': ', '.join(origin_ct),
        'Notifying_ct': ', '.join(notifying_ct),
        'Distribution_ct': ', '.join(distribution_ct),
        'ForAttention_ct': ', '.join(forattention_ct),
    }],
    columns=['Origin_ct', 'Notifying_ct', 'Distribution_ct', 'ForAttention_ct'],
)

Related

How to Match two APIs to update one API dataset using Python

I want to be able to GET information from API 1, match it with API 2, and update API 2's information with the data from API 1. I am trying to figure out the most efficient/automated way to accomplish this, as it also needs to run at an interval of every 10 minutes.
I can query API 1 and get the results; this is what my code looks like:
import json
import requests
myToken = '52c32f6588004cb3ab33b0ff320b8e4f'
myUrl = 'https://api1.com/api/v1/devices.json'
head = {'Authorization': 'Token {}'.format(myToken)}
response = requests.get(myUrl, headers=head)
r = json.loads(response.content)
r
The payload looks like this from API 1
{ "device" : {
"id": 153,
"battery_status" : 61,
"serial_no": "5QBYGKUI05",
"location_lat": "-45.948917",
"location_lng": "29.832179",
"location_address": "800 Laurel Rd, Lansdale, PA 192522,USA"}
}
I want to be able to take this information and match by "serial_no" and update all the other pieces of information for the corresponding device in API 2
I query the data for API 2 and this is what my code looks like
params = {
"location":'cf6707e3-f0ae-4040-a184-737b21a4bbd1',
"dateAdded":'ge:11/23/2020'}
url = requests.get('https://api2.com/api/assets',auth=('api2', '123456'), params=params)
r = json.loads(url.content)
r['items']
The JSON payload looks like this
[{'id': '064ca857-3783-460e-a7a2-245e054dcbe3',
'name': 'Apple Laptop 1',
'model': {'id': '50f5993e-2abf-49c8-86e0-8743dd58db6f',
'name': 'MacBook Pro'},
'manufacturer': {'id': 'f56244e2-76e3-46da-97dd-f72f92ca0779',
'name': 'APPLE'},
'room': {'id': '700ff2dc-0118-46c6-936a-01f0fa88c620',
'name': 'Storage Room 1',
'thirdPartyId': ''},
'location': {'id': 'cf6707e3-f0ae-4040-a184-737b21a4bbd1',
'name': 'Iron Mountain',
'thirdPartyId': ''},
'position': 'NonMounted',
'containerAsset': {'id': '00000000-0000-0000-0000-000000000000',
'name': None},
'baseAsset': {'id': '064ca857-3783-460e-a7a2-245e054dcbe3',
'name': 'Apple Laptop 1'},
'description': None,
'status': {'id': 'df9906d8-2856-45e3-9cba-bd7a1ac4971f',
'name': 'Production'},
'serialNumber': '5QBYGKUI06',
'tagNumber': None,
'alternateTagNumber': None,
'verificationStatus': {'id': 'cb3560a9-eef5-47b9-b033-394d3a09db18',
'name': 'Verified'},
'requiresRFID': False,
'requiresHangTag': False,
'bottomPosition': 0.0,
'leftPosition': 0.0,
'rackPosition': 'Front',
'labelX': None,
'labelY': None,
'verifyNameInRear': False,
'verifySerialNumberInRear': False,
'verifyBarcodeInRear': False,
'isNonDataCenter': False,
'rotate': False,
'customer': {'id': '00000000-0000-0000-0000-000000000000', 'name': None},
'thirdPartyId': '',
'temperature': None,
'dateLastScanned': None,
'placement': 'Floor',
'lastScannedLabelX': None,
'lastScannedLabelY': None,
'userDefinedValues': [{'userDefinedKeyId': '79e77a1e-4030-4308-a8ff-9caf40c04fbd',
'userDefinedKeyName': 'Longitude ',
'value': '-75.208917'},
{'userDefinedKeyId': '72c8056e-9b7d-40ac-9270-9f5929097e82',
'userDefinedKeyName': 'Address',
'value': '800 Laurel Rd, New York ,NY 19050, USA'},
{'userDefinedKeyId': '31aeeb91-daef-4364-8dd6-b0e3436d6a51',
'userDefinedKeyName': 'Battery Level',
'value': '67'},
{'userDefinedKeyId': '22b7ce4f-7d3d-4282-9ecb-e8ec2238acf2',
'userDefinedKeyName': 'Latitude',
'value': '35.932179'}]}
The documentation provided by API 2 tells me they only support PUT for updates as of right now but I would also want to know how I would do this using PATCH as it will be available in the future. So the data payload that I need to successful PUT is this
payload = {'id': '064ca857-3783-460e-a7a2-245e054dcbe3',
'name': 'Apple Laptop 1',
'model': {'id': '50f5993e-2abf-49c8-86e0-8743dd58db6f',
'name': 'MacBook Pro'},
'manufacturer': {'id': 'f56244e2-76e3-46da-97dd-f72f92ca0779',
'name': 'APPLE'},
'room': {'id': '700ff2dc-0118-46c6-936a-01f0fa88c620',
'name': 'Storage Room 1',
'thirdPartyId': ''},
'status': {'id': 'df9906d8-2856-45e3-9cba-bd7a1ac4971f',
'name': 'Production'},
'serialNumber': '5QBYGKUI06',
'verificationStatus': {'id': 'cb3560a9-eef5-47b9-b033-394d3a09db18',
'name': 'Verified'},
'requiresRFID': 'False',
'requiresHangTag': 'False',
'userDefinedValues': [{'userDefinedKeyId': '79e77a1e-4030-4308-a8ff-9caf40c04fbd',
'userDefinedKeyName': 'Longitude ',
'value': '-75.248920'},
{'userDefinedKeyId': '72c8056e-9b7d-40ac-9270-9f5929097e82',
'userDefinedKeyName': 'Address',
'value': '801 Laurel Rd, New York, Ny 192250, USA'},
{'userDefinedKeyId': '31aeeb91-daef-4364-8dd6-b0e3436d6a51',
'userDefinedKeyName': 'Battery Level',
'value': '67'},
{'userDefinedKeyId': '22b7ce4f-7d3d-4282-9ecb-e8ec2238acf2',
'userDefinedKeyName': 'Latitude',
'value': '29.782177'}]}
So a part of this is figuring out how I can query the portions of the JSON data that I need for the update.
I am able to update the information using this line
requests.put('https://api2.com/api/assets/064ca857-3783-460e-a7a2-245e054dcbe3',auth=('API2', '123456'), data=json.dumps(payload))
but I need it to update dynamically, so I don't think the hard-coded id parameter in that line will be efficient from an automation standpoint. If anybody has any ideas or resources to point me in the right direction to learn more about this process (I don't really know what it is even called), that would be greatly appreciated.

Not entirely sure what you are trying to do here, but if you want to pull information nested in the responses you can do this.
Serial number from API 1
r['device']['serial_no']
Serial number for API 2
either r[0]['serialNumber'] or r['items'][0]['serialNumber'] depending on what you are showing
To modify the payload serial number, for example
payload['serialNumber'] = '123456abcdef'

how to get a list of stock tickers by entering a sector name

I am trying to write a code to return a list of stock tickers when entering a sector name.
for example, MSFT is in the technology sector in yfinance and I want the remaining companies that belong to this particular sector.
import yfinance as yf
msft= yf.Ticker("MSFT")
print(msft.info['sector'])
This code will return "Technology". How do I get a dataframe
that contains the other stocks in that sector?
and is it possible to get a more specific sector category such as "Communication" which is more specific than "technology"?

That data can be retrieved pretty easily with a package called yahooquery. Disclaimer: I am the author of the package.
To get stocks in the technology sector, you can do the following:
from yahooquery import Screener
s = Screener()
# data is a dictionary containing the keys passed to the function
data = s.get_screeners('ms_technology', count=25)
# the majority of the data will be in the quotes key
data['ms_technology']['quotes'][0]
{'language': 'en-US', 'region': 'US', 'quoteType': 'EQUITY', 'quoteSourceName': 'Delayed Quote', 'triggerable': True, 'currency': 'USD', 'priceHint': 2, 'longName': 'Apple Inc.', 'financialCurrency': 'USD', 'regularMarketOpen': 123.75, 'averageDailyVolume3Month': 106246233, 'averageDailyVolume10Day': 137149760, 'fiftyTwoWeekLowChange': 74.6375, 'fiftyTwoWeekLowChangePercent': 1.4042143, 'fiftyTwoWeekRange': '53.1525 - 145.09', 'fiftyTwoWeekHighChange': -17.299995, 'fiftyTwoWeekHighChangePercent': -0.119236305, 'fiftyTwoWeekLow': 53.1525, 'fiftyTwoWeekHigh': 145.09, 'dividendDate': 1613001600, 'earningsTimestamp': 1611765000, 'earningsTimestampStart': 1619607540, 'earningsTimestampEnd': 1620043200, 'trailingAnnualDividendRate': 0.807, 'trailingPE': 34.659615, 'trailingAnnualDividendYield': 0.0066551208, 'marketState': 'POSTPOST', 'epsTrailingTwelveMonths': 3.687, 'epsForward': 4.68, 'epsCurrentYear': 4.45, 'priceEpsCurrentYear': 28.716856, 'sharesOutstanding': 16788100096, 'bookValue': 3.936, 'fiftyDayAverage': 132.6306, 'fiftyDayAverageChange': -4.840599, 'fiftyDayAverageChangePercent': -0.036496848, 'twoHundredDayAverage': 122.9772, 'twoHundredDayAverageChange': 4.8127975, 'twoHundredDayAverageChangePercent': 0.039135687, 'marketCap': 2145351368704, 'forwardPE': 27.305557, 'priceToBook': 32.466972, 'sourceInterval': 15, 'exchangeDataDelayedBy': 0, 'exchangeTimezoneName': 'America/New_York', 'exchangeTimezoneShortName': 'EST', 'gmtOffSetMilliseconds': -18000000, 'esgPopulated': False, 'tradeable': True, 'firstTradeDateMilliseconds': 345479400000, 'postMarketChangePercent': 0.7434107, 'postMarketTime': 1614646799, 'postMarketPrice': 128.74, 'postMarketChange': 0.9500046, 'regularMarketChange': 6.529999, 'regularMarketTime': 1614632402, 'regularMarketPrice': 127.79, 'regularMarketDayHigh': 127.93, 'regularMarketDayRange': '122.79 - 127.93', 'regularMarketDayLow': 122.79, 'regularMarketVolume': 116307692, 'regularMarketPreviousClose': 121.26, 'bid': 128.74, 'ask': 
128.75, 'bidSize': 10, 'askSize': 11, 'exchange': 'NMS', 'market': 'us_market', 'messageBoardId': 'finmb_24937', 'fullExchangeName': 'NasdaqGS', 'shortName': 'Apple Inc.', 'regularMarketChangePercent': 5.385122, 'displayName': 'Apple', 'symbol': 'AAPL'}
Put the data into a pandas DataFrame:
df = pd.DataFrame(data['ms_technology']['quotes'])
Retrieve multiple screeners at once:
data = s.get_screeners(['ms_technology', 'ms_utilities', 'ms_real_estate'])
Finally, view the list of available predefined screeners with the following:
# Will return a list
s.available_screeners

You're looking for 'industry' to get a more granular description:
msft= yf.Ticker("MSFT")
print(msft.info['industry'])
Use print(msft.info) to view the full JSON and see what is available. yfinance does not provide the same amount of information for all ticker symbols.

How to convert json into a pandas dataframe?

I'm trying to convert an API response from JSON to a dataframe in pandas. The problem I am having is that the data is nested in the JSON format and I am not getting the right columns in my dataframe.
The data is collected from an API with the following format:
{'tickets': [{'url': 'https...',
'id': 1,
'external_id': None,
'via': {'channel': 'web',
'source': {'from': {}, 'to': {}, 'rel': None}},
'created_at': '2020-05-01T04:16:33Z',
'updated_at': '2020-05-23T03:02:49Z',
'type': 'incident',
'subject': 'Subject',
'raw_subject': 'Raw subject',
'description': 'Hi, this is the description',
'priority': 'normal',
'status': 'closed',
'recipient': None,
'requester_id': 409467360874,
'submitter_id': 409126461453,
'assignee_id': 409126461453,
'organization_id': None,
'group_id': 360009916453,
'collaborator_ids': [],
'follower_ids': [],
'email_cc_ids': [],
'forum_topic_id': None,
'problem_id': None,
'has_incidents': False,
'is_public': True,
'due_at': None,
'tags': ['tag_1',
'tag_2',
'tag_3',
'tag_4'],
'custom_fields': [{'id': 360042034433, 'value': 'value of the first custom field'},
{'id': 360041487874, 'value': 'value of the second custom field'},
{'id': 360041489414, 'value': 'value of the third custom field'},
{'id': 360040980053, 'value': 'correo_electrónico'},
{'id': 360040980373, 'value': 'suscribe_newsletter'},
{'id': 360042046173, 'value': None},
{'id': 360041028574, 'value': 'product'},
{'id': 360042103034, 'value': None}],
'satisfaction_rating': {'score': 'unoffered'},
'sharing_agreement_ids': [],
'comment_count': 2,
'fields': [{'id': 360042034433, 'value': 'value of the first custom field'},
{'id': 360041487874, 'value': 'value of the second custom field'},
{'id': 360041489414, 'value': 'value of the third custom field'},
{'id': 360040980053, 'value': 'correo_electrónico'},
{'id': 360040980373, 'value': 'suscribe_newsletter'},
{'id': 360042046173, 'value': None},
{'id': 360041028574, 'value': 'product'},
{'id': 360042103034, 'value': None}],
'followup_ids': [],
'ticket_form_id': 360003608013,
'deleted_ticket_form_id': 360003608013,
'brand_id': 360004571673,
'satisfaction_probability': None,
'allow_channelback': False,
'allow_attachments': True},
What I already tried is the following: I have converted the JSON format into a dict as following:
x = response.json()
df = pd.DataFrame(x['tickets'])
But I'm struggling with the output. I don't know how to get a correct, ordered, normalized dataframe.
(I'm new in this :) )

Let's suppose you get your request data with this code: r = requests.get(url, auth)
Your data isn't clear yet, so let's get a dataframe of it: data = pd.read_json(json.dumps(r.json(), ensure_ascii = False))
But, probably you will get a dataframe with one single row.
When I faced a problem like this, I wrote this function to get the full data:
# Module-level accumulator: every dict found by listDict() is appended here.
listParam = []


def listDict(entry):
    """Collect every dict reachable through nested lists into ``listParam``.

    A dict is appended as-is (its own values are not searched), a list is
    walked recursively, and any other value is ignored. ``isinstance`` is
    used so that dict/list subclasses (e.g. ``OrderedDict``) are handled too.

    Args:
        entry: A dict, a (possibly nested) list of dicts, or any other value.

    Returns:
        None. Results accumulate in the module-level ``listParam`` list.
    """
    if isinstance(entry, dict):
        listParam.append(entry)
    elif isinstance(entry, list):
        for ent in entry:
            listDict(ent)
Because your data is a dict (it starts with {'tickets': ...}), you will need to get the information like this:
listDict(data.iloc[0][0])
And then,
pd.DataFrame(listParam)
I can't show the results because you didn't post the complete data nor told where I can find the data to test, but this will probably work.

You have to convert the json to dictionary first and then convert the dictionary value for key 'tickets' into dataframe.
# A context manager guarantees the file handle is closed after parsing;
# JSON files are UTF-8, so state the encoding instead of relying on the
# platform default.
with open('file.json', encoding='utf-8') as json_file:
    ticketDictionary = json.load(json_file)
# The list of ticket dicts under 'tickets' becomes one row per ticket.
df = pd.DataFrame(ticketDictionary['tickets'])
'file.json' contains your data here.
df now contains your dataFrame in this format.
For the lists within the response you can have separate dataframes if required:
for field in df['fields']:
df = pd.DataFrame(field)
It will give you this for lengths:
id value
0 360042034433 value of the first custom field
1 360041487874 value of the second custom field
2 360041489414 value of the third custom field
3 360040980053 correo_electrónico
4 360040980373 suscribe_newsletter
5 360042046173 None
6 360041028574 product
7 360042103034 None
This can be one way to structure as you haven't mentioned the exact expected format.

writing multiple nested dictionaries on a CSV file

I have a ton of dicts that I have converted from twitter JSON data. Now, I want to turn them into one .csv file. I searched the site but the solutions seem to fit dicts with very few values or dicts that already exist. In my case the number of keys is a little higher, and I also have to go through an iterative process to turn each JSON file to a dict. In other words, I want to write each of my JSON files on my .csv file as soon as I turn them into a dict file in an iterative process.
Here's my code so far:
json_path = "C://Users//msalj//OneDrive//Desktop//pypr//Tweets"
for filename in os.listdir(json_path):
with open(filename, 'r') as infh:
for data in json_parse(infh):
and here is a sample of my converted JSON files:
{'actor': {'displayName': 'RIMarkable',
'favoritesCount': 0,
'followersCount': 0,
'friendsCount': 0,
'id': 'id:twitter.com:3847371',
'image': 'Picture_13.png',
'languages': ['en'],
'link': 'ht........ble',
'links': [{'href': 'htt.....m', 'rel': 'me'}],
'listedCount': 0,
'objectType': 'person',
'postedTime': '2007-01-09T02:53:35.000Z',
'preferredUsername': 'RIMarkable',
'statusesCount': 0,
'summary': 'The Official, Unofficial BlackBerry Weblog',
'twitterTimeZone': 'Eastern Time (US & Canada)',
'utcOffset': '0',
'verified': False},
'body': 'Jim Balsillie To Present At JP Morgan Technology Conference: Research in Motion co-CEO, Jim Balsillie,.. ht...qo',
'generator': {'displayName': 'twitterfeed', 'link': 'htt......om'},
'gnip': {'matching_rules': [{'tag': None, 'value': '"JP Morgan"'}]},
'id': 'tag:search.twitter.com,2005:66178882',
'link': 'ht...82',
'object': {'id': 'object:search.twitter.com,2005:66178882',
'link': 'ht.....82',
'objectType': 'note',
'postedTime': '2007-05-16T19:00:24.000Z',
'summary': 'Jim Balsillie To Present At JP Morgan Technology Conference: Research in Motion co-CEO, Jim Balsillie,.. ht......qo'},
'objectType': 'activity',
'postedTime': '2007-05-16T19:00:24.000Z',
'provider': {'displayName': 'Twitter',
'link': 'ht......m',
'objectType': 'service'},
'retweetCount': 0,
'twitter_entities': {'hashtags': [],
'urls': [{'expanded_url': None,
'indices': [105, 130],
'url': 'htt.......5qo'}],
'user_mentions': []},
'verb': 'post'}
Can anybody help me with its coding? Thanks a lot!

With various depths, if you want to keep everything, this problem gets a little more complicated.
What I've done with this issue is flattened the dictionary.
def flatten_dict(input_dict):
    """Flatten a nested dict into a single-level dict.

    Handling per value type:
      * dict: flattened recursively; its keys are merged into the result
        unchanged.
      * list/tuple: expanded into "<key>-<index>" entries; the elements
        themselves are stored as-is.
      * str/int/float (bool included, as a subclass of int): copied.
      * anything else (e.g. None): reported via print and skipped.

    NOTE: keys coming from nested dicts are not prefixed with their parent
    key, so identical key names at different levels overwrite each other
    (the last one seen wins).

    Args:
        input_dict: The (possibly nested) dict to flatten.

    Returns:
        A new flat dict; ``input_dict`` is not modified.
    """
    flat_dict = {}
    for k, v in input_dict.items():
        if isinstance(v, dict):
            # Recurse on the nested value (the original called .items() on
            # the function object itself, which raised AttributeError).
            for k2, v2 in flatten_dict(v).items():
                flat_dict[k2] = v2
        elif isinstance(v, (list, tuple)):
            for index, i in enumerate(v):
                flat_dict["{}-{}".format(k, index)] = i
        elif isinstance(v, (str, int, float)):
            flat_dict[k] = v
        else:
            print("unknwon type, add handling for: {}".format(type(v)))
    return flat_dict
then I'll use the first json instance to create a header row:
header_row = [k for k in flatten_dict(row1)]
and print the header row to the csv
",".join(header_row)
and print the data in the same order for each json row afterwards:
import csv
import io

for row in rows:
    flat_row = flatten_dict(row)
    # Headers missing from this row become empty cells. The csv writer
    # converts non-string values (ints, bools, ...) to text and quotes any
    # value containing commas, quotes or newlines, which a plain
    # ",".join() cannot do (it raises TypeError on non-str items).
    line_buffer = io.StringIO()
    csv.writer(line_buffer, lineterminator="").writerow(
        [flat_row.get(header, "") for header in header_row]
    )
    print_row = line_buffer.getvalue()

scraping Json with python 3

Here is the script:
from bs4 import BeautifulSoup as bs4
import requests
import json
from lxml import html
from pprint import pprint
import re
def get_data():
    """Download the MLB game-lines page and return its embedded data text.

    The page is fetched with a browser-like User-Agent, parsed with
    BeautifulSoup, and the first <script> element whose text matches the
    ``swc_market_lists = {...}`` pattern is located.

    Returns:
        str: The matched script's text with its first 23 characters removed.

    Raises:
        AttributeError: If no matching <script> element is found
            (``soup.find`` returns None).
        requests.exceptions.RequestException: On network errors or timeout.
    """
    url = 'https://sports.bovada.lv//baseball/mlb/game-lines-market-group'
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"}
    # Without a timeout, requests may block forever on an unresponsive server.
    r = requests.get(url, headers=headers, timeout=30)
    html_bytes = r.text
    soup = bs4(html_bytes, 'lxml')
    pattern = re.compile(r"swc_market_lists\s+=\s+(\{.*?\})")
    script = soup.find("script", text=pattern)
    # NOTE(review): the slice presumably skips a fixed-length assignment
    # prefix (e.g. "var swc_market_lists = ") so the remainder is JSON --
    # confirm against the live page.
    return script.text[23:]
# Fetch the embedded script payload and decode it as JSON.
test1 = get_data()
data = json.loads(test1)

# Walk every market group, then every event inside it, printing the two
# opponent (pitcher) names of each event.
for market_group in data['items']:
    for event in market_group['itemList']['items']:
        pitch_a = event['opponentAName']
        pitch_b = event['opponentBName']
##        group = item2['displayGroups']
##        for item3 in group:
##            new_il = item3['itemList']
##            for item4 in new_il:
##                market = item4['description']
##                oc = item4['outcomes']
        print(pitch_a, pitch_b)
##for items in data['items']:
## pos = items['itemList']['items']
## for item in pos:
## work = item['competitors']
## pitcher_a = item['opponentAName']
## pitcher_b = item['opponentBName']
## group = item['displayGroups']
## for item, item2 in zip(work,group):
## team = item['abbreviation']
## place = item['type']
## il2 = item2['itemList']
## for item in il2:
## ml = item['description']
## print(team,place,pitcher_a,pitcher_b,ml)
I have been trying to scrape
team abbrev = ['items']['itemList']['items']['competitors']['abbreviation']
home_away = ['items']['itemList']['items']['competitors']['type']
team pitcher home = ['items']['itemList']['items']['opponentAName']
team pitcher away = ['items']['itemList']['items']['opponentBName']
moneyline american odds = ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['american']
Total runs = ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['handicap']
Part of the Json pprinted:
[{'baseLink': '/baseball/mlb/game-lines-market-group',
'defaultType': True,
'description': 'Game Lines',
'id': '136',
'itemList': {'items': [{'LIVE': True,
'atmosphereLink': '/api/atmosphere/eventNotification/events/A/3149961',
'awayTeamFirst': True,
'baseLink': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
'competitionId': '24736',
'competitors': [{'abbreviation': 'LAA',
'description': 'Los Angeles Angels',
'id': '3149961-1642',
'rotationNumber': '978',
'shortName': 'Angels',
'type': 'HOME'},
{'abbreviation': 'MIN',
'description': 'Minnesota Twins',
'id': '3149961-9990',
'rotationNumber': '977',
'shortName': 'Twins',
'type': 'AWAY'}],
'denySameGame': 'NO',
'description': 'Minnesota Twins # Los Angeles Angels',
'displayGroups': [{'baseLink': '/baseball/mlb/game-lines-market-group',
'defaultType': True,
'description': 'Game Lines',
'id': '136',
'itemList': [{'belongsToDefault': True,
'columns': 'H2Columns',
'description': 'Moneyline',
'displayGroups': '136,A-136',
'id': '46892277',
'isInRunning': True,
'mainMarketType': 'MONEYLINE',
'mainPeriod': True,
'marketTypeGroup': 'MONEY_LINE',
'notes': '',
'outcomes': [{'competitorId': '3149961-9990',
'description': 'Minnesota '
'Twins',
'id': '211933276',
'price': {'american': '-475',
'decimal': '1.210526',
'fractional': '4/19',
'id': '1033002124',
'outcomeId': '211933276'},
'status': 'OPEN',
'type': 'A'},
{'competitorId': '3149961-1642',
'description': 'Los '
'Angeles '
'Angels',
'id': '211933277',
'price': {'american': '+310',
'decimal': '4.100',
'fractional': '31/10',
'id': '1033005679',
'outcomeId': '211933277'},
'status': 'OPEN',
'type': 'H'}],
'periodType': 'Live '
'Match',
'sequence': '14',
'sportCode': 'BASE',
'status': 'OPEN',
'type': 'WW'},
{'belongsToDefault': True,
'columns': 'H2Columns',
'description': 'Runline',
'displayGroups': '136,A-136',
'id': '46892287',
'isInRunning': True,
'mainMarketType': 'SPREAD',
'mainPeriod': True,
'marketTypeGroup': 'SPREAD',
'notes': '',
'outcomes': [{'competitorId': '3149961-9990',
'description': 'Minnesota '
'Twins',
'id': '211933278',
'price': {'american': '+800',
'decimal': '9.00',
'fractional': '8/1',
'handicap': '-1.5',
'id': '1033005677',
'outcomeId': '211933278'},
'status': 'OPEN',
'type': 'A'},
{'competitorId': '3149961-1642',
'description': 'Los '
'Angeles '
'Angels',
'id': '211933279',
'price': {'american': '-2000',
'decimal': '1.050',
'fractional': '1/20',
'handicap': '1.5',
'id': '1033005678',
'outcomeId': '211933279'},
'status': 'OPEN',
'type': 'H'}],
'periodType': 'Live '
'Match',
'sequence': '14',
'sportCode': 'BASE',
'status': 'OPEN',
'type': 'SPR'}],
'link': '/baseball/mlb/game-lines-market-group'}],
'feedCode': '13625145',
'id': '3149961',
'link': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
'notes': '',
'numMarkets': 2,
'opponentAId': '214704',
'opponentAName': 'Tyler Skaggs (L)',
'opponentBId': '215550',
'opponentBName': 'Lance Lynn (R)',
'sport': 'BASE',
'startTime': 1526090820000,
'status': 'O',
'type': 'MLB'},
There are a few different loops I had started in the script above, but neither of them is working out the way I would like.
away team | away moneyline | away pitcher | Total Runs | and repeat for Home Team is what I would like it to be eventually. I can write to csv once it is parsed the proper way.
Thank you for the fresh set of eyes, I've been working on this for the better part of a day trying to figure out the best way to access the content I would like. If Json is not the best way and bs4 works better I would love to hear your opinion

There's no simple answer to your problem. Scraping data requires you to carefully assess the data you are dealing with, work out where the parts you want to extract are located and figure out how to effectively store the data you extract.
Try printing the data in your loops to visualise what is happening in your code (or try debugging). From there it's easy to figure out if you're iterating over what you expect. Look for patterns throughout the input data to help organise the data you extract.
To help yourself, you should give your variables descriptive names, separate your code into logical chunks and add comments when it starts to get complicated.
Here's some working code, but I encourage you to try what I told you above, then if you're still stuck look below for guidance.
# Result: game description -> {'HOME': {...}, 'AWAY': {...}}
output = {}
root = data['items'][0]
for game_line in root['itemList']['items']:
    # Per-game scratch dict, keyed by competitor type (HOME or AWAY)
    team_data = {}

    # Competitors: record abbreviation and full name for each side
    for team in game_line['competitors']:
        side = team['type']
        entry = team_data[side] = {}
        entry['abbreviation'] = team['abbreviation']
        entry['name'] = team['description']

    # Markets of the first display group: moneyline odds and handicap
    for market in game_line['displayGroups'][0]['itemList']:
        for outcome in market['outcomes']:
            # Outcome type is 'H' for the home side; anything else is away
            side = 'HOME' if outcome['type'] == 'H' else 'AWAY'
            market_type = market['mainMarketType']
            if market_type == 'MONEYLINE':
                team_data[side]['moneyline'] = outcome['price']['american']
            elif market_type == 'SPREAD':
                team_data[side]['total runs'] = outcome['price']['handicap']

    # Pitchers: opponent A is stored on HOME, opponent B on AWAY
    for side, pitcher_key in (('HOME', 'opponentAName'), ('AWAY', 'opponentBName')):
        team_data[side]['pitcher'] = game_line[pitcher_key]

    # File this game's data under its description
    output[game_line['description']] = team_data
This produces output like:
{
'Atlanta Braves # Miami Marlins': {
'AWAY': {
'abbreviation': 'ATL',
'moneyline': '-130',
'name': 'Atlanta Braves',
'pitcher': 'Mike Soroka (R)',
'total runs': '-1.5'
},
'HOME': {
'abbreviation': 'MIA',
'moneyline': '+110',
'name': 'Miami Marlins',
'pitcher': 'Jarlin Garcia (L)',
'total runs': '1.5'
}
},
'Boston Red Sox # Toronto Blue Jays': {
'AWAY': {
'abbreviation': 'BOS',
'moneyline': '-133',
'name': 'Boston Red Sox',
'pitcher': 'David Price (L)',
'total runs': '-1.5'
},
'HOME': {
'abbreviation': 'TOR',
'moneyline': '+113',
'name': 'Toronto Blue Jays',
'pitcher': 'Marco Estrada (R)',
'total runs': '1.5'
}
},
}

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scraping a particular dictionary format to a dataframe in python - python

Related

How to Match two APIs to update one API dataset using Python

how to get a list of stock tickers by entering a sector name

How to convert json into a pandas dataframe?

writing multiple nested dictionaries on a CSV file

scraping Json with python 3

Categories

Resources