I have the following code:
res = requests.get("http://www.ucdenver.edu/pages/ucdwelcomepage.aspx")
soup = BeautifulSoup(res.content, 'html5lib')
scripts = soup.select('script', {"type":"application/ld+json"})
scripts = [script for script in scripts]  # keep each script found
print(scripts)
for script in scripts:
    script.get(res)
    print(script)
and from this code I got the result below. I want to get into the "department" array to capture two elements (there are multiple departments in "department"):
{
"#context": "https://schema.org/",
"#type": "Organization",
"url": "https://www.ucdenver.edu",
"logo": "https://www.ucdenver.edu/images/default-source/global-theme-images/cu_logo.png",
"name": "University of Colorado Denver",
"alternateName": "CU Denver",
"telephone": "1+ 303-315-5969",
"address": {
"#type": "PostalAddress",
"streetAddress": "1201 Larimer Street",
"addressLocality": "Denver",
"addressRegion": "CO",
"postalCode": "80204",
"addressCountry": "US"
},
"department": [{
"name": "Center for Undergraduate Exploration and Advising",
"email": "mailto:CUEA#ucdenver.edu",
"telephone": "1+ 303-315-1940",
"url": "https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising",
"address": [{
"#type": "PostalAddress",
"streetAddress": "1201 Larimer Street #1113",
"addressLocality": "Denver",
"addressRegion": "CO",
"postalCode": "80204",
"addressCountry": "US"
}]
},
From that object I only want to capture "name" and "url".
This is my first time playing with web scraping, and I'm not too sure how to get into "department": [{ to then capture the two elements I want.
Once you've parsed the JSON output you've shown into a Python dict and stored it in a variable called data, for example, you can do:
result = []
for department in data["department"]:
    result.append({"name": department["name"], "url": department["url"]})
print(result)
# [{"name": "Center for Undergraduate Exploration and Advising", "url": "https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising"}, {"name": "another name", "url": "another url"}, ...]
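Or equivalently, as a single list comprehension:
result = [{"name": d["name"], "url": d["url"]} for d in data["department"]]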
This worked for me:
from bs4 import BeautifulSoup
import requests
import json

res = requests.get("http://www.ucdenver.edu/pages/ucdwelcomepage.aspx")
soup = BeautifulSoup(res.content, 'html5lib')
scripts = soup.find_all(attrs={"type": "application/ld+json"})

for s in scripts:
    content = s.contents[0]  # get the text of the script node
    j = json.loads(content)  # parse it as JSON into a Python data structure
    for dept in j["department"]:
        print(">>>", dept["name"], dept["url"])
You first extract the text of the script node. Then convert that text using the json package to a Python data structure. Then you can iterate through the data using a for-loop.
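One caveat: a page can carry several ld+json blocks, and not every block necessarily has a "department" key. A minimal defensive variant (same soup as above; the dict.get() fallbacks are my addition):

for s in soup.find_all(attrs={"type": "application/ld+json"}):
    j = json.loads(s.contents[0])
    for dept in j.get("department", []):  # skip blocks without a "department" key
        print(">>>", dept.get("name"), dept.get("url"))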
I am a rookie at Python and I have data files that I would like to convert from JSON to CSV. The issues are that my code returns an error I am unable to resolve, and the data varies from file to file; I would like to have one script that can be applied to multiple files by just changing the file location. I would also like not to hard-code the company name and company type, but I don't know how to go about that. The data is structured as follows:
{
"company_name": "Google",
"company_type": "Public",
"employees": [{
"staff": [{
"name": "John Doe",
"type": "FTE",
"id": "1111111111",
"region": "Northeast"
}, {
"name": "Jane Doe",
"type": "FTE",
"id": "222222222",
"region": "Northwest"
}],
"setup": [{
"description": "Onsite",
"location": "New York City"
}, {
"description": "Hybrid",
"location": "Seattle"
}],
"role": [{
"description": "Business Analyst",
"salary": "70000"
}, {
"description": "Manager",
"salary": "90000"
}]
}, {
"contractors": [{
"name": "Jessica Smith",
"type": "PTE",
"id": "333333333",
"region": "Southeast"
}],
"setup": [{
"description": "Remote",
"location": "Miami"
}],
"role": [{
"description": "Project Manager",
"salary": "80000"
}]
}]
}
The code I have so far is:
import json
import csv
import ijson

file = open("C:/Users/User1/sample_file.json","w")
file_writer = csv.writer(file)
file_writer.writerow(("Company Name","Company Type","Name","Type","ID","Region","Description","Location","Description","Salary"))

with open("C:/Users/User1/sample_file.json","rb") as f:
    company_name = "Google"
    company_type = "Public"
    for record in ijson.items(f,"employees.item"):
        name = record['staff'][0]['name']
        type = record['staff'][0]['type']
        id = record['staff'][0]['id']
        region = record['staff'][0]['region']
        description = record['setup'][0]['description']
        location = record['setup'][0]['location']
        description = record['role'][0]['description']
        salary = record['role'][0]['salary']
        file_writer.writerow((comapny_name, company_type, name, type, id, region, description, location, description, salary))

file.close()
Any help is greatly appreciated.
Assuming that all of your files have the same general structure, using a csv.DictWriter should work. Just iterate through the employee sections creating a single dictionary to represent each employee and call writer.writerow() once all of the data has been collected.
For example:
import csv
import json

data = json.load(open(filename))
columns = ["company name","company type","name","type","id","region","description","location","salary"]

def convert(data, headers):
    with open("employees.csv", "wt") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers, extrasaction="ignore", restval=None)
        writer.writeheader()
        for emp_type in data["employees"]:
            lst = []
            for _, v in emp_type.items():
                for i, x in enumerate(v):
                    if len(lst) <= i:
                        lst.append({"company name": data["company_name"],
                                    "company type": data["company_type"]})
                    lst[i].update(x)
            for item in lst:
                writer.writerow(item)

convert(data, columns)
OUTPUT
company name,company type,name,type,id,region,description,location,salary
Google,Public,John Doe,FTE,1111111111,Northeast,Business Analyst,New York City,70000
Google,Public,Jane Doe,FTE,222222222,Northwest,Manager,Seattle,90000
Google,Public,Jessica Smith,PTE,333333333,Southeast,Project Manager,Miami,80000
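Since you wanted one script that works across files by changing only the location, you could drive convert() from a loop over paths. A sketch, with a placeholder glob pattern; note that convert() as written hardcodes "employees.csv", so you would want to pass an output filename through as well:

import glob
import json

for path in glob.glob("C:/Users/User1/*.json"):  # placeholder pattern
    with open(path) as f:
        convert(json.load(f), columns)  # consider adding an output-filename parameter to convert()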
I'm trying to scrape something from a site using Python, for example the views on this video (the URL below), but it always returns None. What am I doing wrong? Here is the code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.youtube.com/watch?v=1OfK8UmLMl0&ab_channel=HitraNtheUnnecessaryProgrammer'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
views = soup.body.find(class_='view-count style-scope ytd-video-view-count-renderer')
print(views)
Thanks!
(btw when I try the code shown in the video it works fine)
The page is loaded dynamically, and requests doesn't support dynamically loaded pages. However, the data is available in JSON format, so you can use the re/json modules to extract the correct data.
For example, to get the "view count":
import re
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.youtube.com/watch?v=1OfK8UmLMl0&ab_channel=HitraNtheUnnecessaryProgrammer"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
# We locate the JSON data using a regular-expression pattern
# (note: re.search needs a string, not the soup object itself)
data = re.search(r"var ytInitialData = ({.*?});", str(soup)).group(1)
data = json.loads(data)
print(
data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][0][
"videoPrimaryInfoRenderer"
]["viewCount"]["videoViewCountRenderer"]["viewCount"]["simpleText"]
)
Output:
124 views
The variable data contains all the data as a Python dictionary (dict). To print all of it you can use:
print(json.dumps(data, indent=4))
Output (truncated):
{
"responseContext": {
"serviceTrackingParams": [
{
"service": "CSI",
"params": [
{
"key": "c",
"value": "WEB"
},
{
"key": "cver",
"value": "2.20210701.07.00"
},
{
"key": "yt_li",
"value": "0"
},
{
"key": "GetWatchNext_rid",
"value": "0x1d62a299beac9e1f"
}
]
},
{
"service": "GFEEDBACK",
"params": [
{
"key": "logged_in",
"value": "0"
},
{
"key": "e",
"value": "24037443,24058293,24058128,24003103,24042870,23882685,24023960,23944779,24027649,24046896,24059898,24049577,23983296,23966208,24056265,23891346,1714258,24049575,24045412,24003105,23999405,24051884,23891344,23986022,24049573,24056839,24053866,24058240,23744176,23998056,24010336,24037586,23934970,23974595,23735348,23857950,24036947,24051353,24038425,23990875,24052245,24063702,24058380,23983813,24058812,24026834,23996830,23946420,24001373,24049820,24030040,24062848,23968386,24027689,24004644,23804281,24049569,23973490,24044110,23884386,24012512,24044124,24059521,23918597,24007246,24049567,24022729,24037794"
}
]
},
{
"service": "GUIDED_HELP",
"params": [
{
"key": "logged_in",
"value": "0"
}
]
},
{
"service": "ECATCHER",
"params": [
{
"key": "client.version",
"value": "2.20210701"
},
{
"key": "client.name",
"value": "WEB"
}
]
}
],
"mainAppWebResponseContext": {
"loggedOut": true
},
"webResponseContextExtensionData": {
"ytConfigData": {
"visitorData": "CgtoanprT1pPbmtWTSjYk46HBg%3D%3D",
"rootVisualElementType": 3832
},
I usually try to view the API requests (from the network tab in dev tools) when a site is dynamically loaded. I was successful with sites such as Udemy, Skillshare, and a few others, but not with YouTube. So in such a case I would use the official YouTube API, which is quite easy to use and has plenty of code samples on GitHub. With that, you just request your data and get a JSON response, which you can convert to a dictionary with response.json(). Another option would be Selenium, which is not a solution I like, and it's pretty resource- and time-consuming. Requesting from an API is faster than scraping or any other solution on earth. When something doesn't provide an API, you need scraping.
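For reference, a minimal sketch of that official-API route with the YouTube Data API v3 (the key is a placeholder you would create in the Google Cloud console; double-check the endpoint and field names against the current docs):

import requests

API_KEY = "YOUR_API_KEY"  # placeholder
video_id = "1OfK8UmLMl0"

resp = requests.get(
    "https://www.googleapis.com/youtube/v3/videos",
    params={"part": "statistics", "id": video_id, "key": API_KEY},
)
# videos.list with part=statistics returns view/like counts per video
print(resp.json()["items"][0]["statistics"]["viewCount"])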
I am fairly new to using APIs in Python and I am trying to create a system that outputs data from previous motorsport races. I have sent requests to an API, but I am struggling to get it to output just one specific piece of data (e.g. time, location). This is what I get when I print the raw JSON response:
{
"MRData": {
"RaceTable": {
"Races": [
{
"Circuit": {
"Location": {
"country": "Spain",
"lat": "41.57",
"locality": "Montmeló",
"long": "2.26111"
},
"circuitId": "catalunya",
"circuitName": "Circuit de Barcelona-Catalunya",
"url": "http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya"
},
"date": "2020-08-16",
"raceName": "Spanish Grand Prix",
"round": "6",
"season": "2020",
"time": "13:10:00Z",
"url": "https://en.wikipedia.org/wiki/2020_Spanish_Grand_Prix"
}
],
"round": "6",
"season": "2020"
},
"limit": "30",
"offset": "0",
"series": "f1",
"total": "1",
"url": "http://ergast.com/api/f1/2020/6.json",
"xmlns": "http://ergast.com/mrd/1.4"
}
}
Just to get to grips with APIs I am simply trying to output one piece of data for a specific race, and once I can do that, I'll be able to scale it up and output all sorts of data. I'd assumed it would be as simple as typing print(data['time']) (as seen below), but I get an error message saying this:
KeyError: 'time'
My source code:
import requests
response = requests.get("http://ergast.com/api/f1/2020/6.json")
data = response.json()
print(data["time"])
Any help is appreciated!
Like this...
import json
data = """{
"MRData":{
"xmlns":"http://ergast.com/mrd/1.4",
"series":"f1",
"url":"http://ergast.com/api/f1/2020/6.json",
"limit":"30",
"offset":"0",
"total":"1",
"RaceTable":{
"season":"2020",
"round":"6",
"Races":[
{
"season":"2020",
"round":"6",
"url":"https://en.wikipedia.org/wiki/2020_Spanish_Grand_Prix",
"raceName":"Spanish Grand Prix",
"Circuit":{
"circuitId":"catalunya",
"url":"http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya",
"circuitName":"Circuit de Barcelona-Catalunya",
"Location":{
"lat":"41.57",
"long":"2.26111",
"locality":"Montmeló",
"country":"Spain"
}
},
"date":"2020-08-16",
"time":"13:10:00Z"
}
]
}
}
}"""
jsonData = json.loads(data)
Races is an array; in this case there is only one race, so you would designate it as ["Races"][0]:
print(jsonData["MRData"]["RaceTable"]["Races"][0]["time"])
data['time'] would work if you had a flat dictionary, but you have a nested dicts/list structure, so:
data["MRData"]["RaceTable"]["Races"][0]["time"]
data["MRData"] returns another dict, which has a key "RaceTable". The value of this key is again a dictionary which has a key "Races". The value of this is a list of races, of which you only have one. The races are again dicts which have the key time.
I am trying to find the specific ID for an altcoin, but I'm not sure how to do it. When I print, I get a very long JSON string and get lost trying to find it. Is there an easier way?
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import time
cmc = requests.get('https://coinmarketcap.com/')
soup = BeautifulSoup(cmc.content, 'html.parser')
print(soup.prettify())
What I want is to determine the exact id corresponding to the altcoin. The output below is for one coin, but it is a long list, and I cannot easily find the exact one without looking manually.
{"id":1,"name":"Bitcoin","symbol":"BTC","slug":"bitcoin","max_supply":21000000,"circulating_supply":18614718,"total_supply":18614718,"last_updated":"2021-01-30T15:00:02.000Z","quote":{"USD":{"price":34177.31601866782,"volume_24h":83208963467.24487,"percent_change_1h":1.15037986,"percent_change_24h":-10.87555443,"percent_change_7d":7.03677315,"percent_change_30d":19.84946991,"market_cap":636201099684.3843,"last_updated":"2021-01-30T15:00:02.000Z"}},"rank":1,"noLazyLoad":true}
I took a closer look at the HTML.
It appears that the JSON string data you seek is inside of a <script> tag with id "__NEXT_DATA__".
I'm not that familiar with BeautifulSoup so a more elegant way may exist to get the data. Here is the code I used.
import json
import requests
from bs4 import BeautifulSoup

cmc = requests.get('https://coinmarketcap.com/')
soup = BeautifulSoup(cmc.content, 'html.parser')

for item in soup.select('script[id="__NEXT_DATA__"]'):
    data = json.loads(item.string)  # load JSON string as a dict
    desired_data = data["props"]["initialState"]["cryptocurrency"]["listingLatest"]["data"]
    print(json.dumps(desired_data, indent=2))  # pretty output string
TRUNCATED OUTPUT:
[
{
"id": 1,
"name": "Bitcoin",
"symbol": "BTC",
"slug": "bitcoin",
"max_supply": 21000000,
"circulating_supply": 18614718,
"total_supply": 18614718,
"last_updated": "2021-01-30T14:51:02.000Z",
"quote": {
"USD": {
"price": 34138.18238095427,
"volume_24h": 83651976977.0413,
"percent_change_1h": 1.36922474,
"percent_change_24h": -9.82670796,
"percent_change_7d": 6.33079444,
"percent_change_30d": 19.72629419,
"market_cap": 635472638054.0323,
"last_updated": "2021-01-30T14:51:02.000Z"
}
},
"rank": 1,
"noLazyLoad": true
},
{
"id": 1027,
"name": "Ethereum",
"symbol": "ETH",
"slug": "ethereum",
"max_supply": null,
"circulating_supply": 114465285.999,
"total_supply": 114465285.999,
"last_updated": "2021-01-30T14:51:02.000Z",
"quote": {
"USD": {
"price": 1364.155096452962,
"volume_24h": 38819994919.48616,
"percent_change_1h": 1.95180621,
"percent_change_24h": -3.86551103,
"percent_change_7d": 10.22893483,
"percent_change_30d": 85.96783538,
"market_cap": 156148403262.48172,
"last_updated": "2021-01-30T14:51:02.000Z"
}
},
"rank": 2,
"noLazyLoad": true
},…
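From there, picking out one coin's id without scanning by eye is just a filter over desired_data, e.g. by symbol (assuming the listing structure shown above):

eth_id = next(c["id"] for c in desired_data if c["symbol"] == "ETH")
print(eth_id)  # 1027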
I'm using the national weather service API, and when you use a specific URL you get JSON data back. My program so far grabs everything, including 155 hours of weather data.
Simply put, I'm trying to parse the data and grab the weather for the latest hour, but everything is in a nested data structure.
My code, the JSON data, and more information are below. Any help is appreciated.
import requests
import json

def get_current_weather():  # returns JSON data from the API
    url = 'https://api.weather.gov/gridpoints/*office*/*any number,*any number*/forecast/hourly'
    response = requests.get(url)
    full_data = response.json()
    return full_data

def main():  # prints the information grabbed from the API
    print(get_current_weather())

if __name__ == "__main__":
    main()
In the JSON response I get, there are 3 layers before you reach the 'shortForecast' data I'm trying to get. The first nest is 'properties'; everything before it is irrelevant to my program. The second nest is 'periods', and each period is a new hour, 0 being the latest. Lastly, I just need to grab the 'shortForecast' in the first period, i.e. periods[0].
{
"#context": [
"https://geojson.org/geojson-ld/geojson-context.jsonld",
{
"#version": "1.1",
"wx": "https://api.weather.gov/ontology#",
"geo": "http://www.opengis.net/ont/geosparql#",
"unit": "http://codes.wmo.int/common/unit/",
"#vocab": "https://api.weather.gov/ontology#"
}
],
"type": "Feature",
"geometry": {
"type": "Polygon",
"coordinates": [
[
*data I'm not gonna add*
]
]
},
"properties": {
"updated": "2021-02-11T05:57:24+00:00",
"units": "us",
"forecastGenerator": "HourlyForecastGenerator",
"generatedAt": "2021-02-11T07:12:58+00:00",
"updateTime": "2021-02-11T05:57:24+00:00",
"validTimes": "2021-02-10T23:00:00+00:00/P7DT14H",
"elevation": {
"value": ,
"unitCode": "unit:m"
},
"periods": [
{
"number": 1,
"name": "",
"startTime": "2021-02-11T02:00:00-05:00",
"endTime": "2021-02-11T03:00:00-05:00",
"isDaytime": false,
"temperature": 18,
"temperatureUnit": "F",
"temperatureTrend": null,
"windSpeed": "10 mph",
"windDirection": "N",
"icon": "https://api.weather.gov/icons/land/night/snow,40?size=small",
"shortForecast": "Chance Light Snow",
"detailedForecast": ""
},
{
"number": 2,
"name": "",
"startTime": "2021-02-11T03:00:00-05:00",
"endTime": "2021-02-11T04:00:00-05:00",
"isDaytime": false,
"temperature": 17,
"temperatureUnit": "F",
"temperatureTrend": null,
"windSpeed": "12 mph",
"windDirection": "N",
"icon": "https://api.weather.gov/icons/land/night/snow,40?size=small",
"shortForecast": "Chance Light Snow",
"detailedForecast": ""
},
OK, so I didn't want to edit everything again, so here is the new get_current_weather method. I was able to get to 'periods', but after that I'm still stumped:
def get_current_weather():
    url = 'https://api.weather.gov/gridpoints/ILN/82,83/forecast/hourly'
    response = requests.get(url)
    full_data = response.json()
    return full_data['properties'].get('periods')
For the dictionary object, you can access the nested elements by using indexing multiple times.
So, for your dictionary object, you can use the following to get the value of shortForecast for the first element in the list of dictionaries under the key periods, which in turn sits under the key properties in the main dictionary:
full_data['properties']['periods'][0]['shortForecast']
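Putting that together with your edited function, a guarded sketch (the .get() fallback is my addition):

def get_latest_short_forecast():
    periods = get_current_weather()  # the list under properties -> periods
    if periods:
        return periods[0].get("shortForecast")
    return None

print(get_latest_short_forecast())  # e.g. "Chance Light Snow"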