Calling back-end API of CNBC in Python

As a follow-up to this question: how can I locate the XHR request used to retrieve the data from CNBC News's back-end API, so that I can scrape this CNBC search query?
The end goal is to have a doc with: headline, date, full article and url.
I have found this: https://api.sail-personalize.com/v1/personalize/initialize?pageviews=1&isMobile=0&query=coronavirus&qsearchterm=coronavirus
Which tells me I don't have access. Is there a way to access information anyway?

Actually, my previous answer for you already addressed your question regarding the XHR request: you can locate it in your browser's developer tools, under the Network tab, filtered to XHR. Here's the code that calls that endpoint directly:
import requests

params = {
    "queryly_key": "31a35d40a9a64ab3",
    "query": "coronavirus",
    "endindex": "0",
    "batchsize": "100",
    "callback": "",
    "showfaceted": "true",
    "timezoneoffset": "-120",
    "facetedfields": "formats",
    "facetedkey": "formats|",
    "facetedvalue": "!Press Release|",
    "needtoptickers": "1",
    "additionalindexes": "4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28"
}

goal = ["cn:title", "_pubDate", "cn:liveURL", "description"]


def main(url):
    with requests.Session() as req:
        for page, item in enumerate(range(0, 1100, 100)):
            print(f"Extracting Page# {page + 1}")
            params["endindex"] = item
            r = req.get(url, params=params).json()
            for loop in r['results']:
                print([loop[x] for x in goal])


main("https://api.queryly.com/cnbc/json.aspx")
Pandas DataFrame version:
import requests
import pandas as pd

params = {
    "queryly_key": "31a35d40a9a64ab3",
    "query": "coronavirus",
    "endindex": "0",
    "batchsize": "100",
    "callback": "",
    "showfaceted": "true",
    "timezoneoffset": "-120",
    "facetedfields": "formats",
    "facetedkey": "formats|",
    "facetedvalue": "!Press Release|",
    "needtoptickers": "1",
    "additionalindexes": "4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28"
}

goal = ["cn:title", "_pubDate", "cn:liveURL", "description"]


def main(url):
    with requests.Session() as req:
        allin = []
        for page, item in enumerate(range(0, 1100, 100)):
            print(f"Extracting Page# {page + 1}")
            params["endindex"] = item
            r = req.get(url, params=params).json()
            for loop in r['results']:
                allin.append([loop[x] for x in goal])
        new = pd.DataFrame(
            allin, columns=["Title", "Date", "Url", "Description"])
        new.to_csv("data.csv", index=False)


main("https://api.queryly.com/cnbc/json.aspx")
Output: a data.csv file with the columns Title, Date, Url and Description.
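The endpoint above doesn't return the full article body, which the question's end goal requires. A minimal sketch of a follow-up fetch per result URL, assuming the article text sits in a div with class "ArticleBody-articleBody" (an assumption about CNBC's current markup, which may change or be absent on video or paywalled pages):

import requests
from bs4 import BeautifulSoup

def fetch_article(url):
    # Download one article page and join its body paragraphs.
    # "ArticleBody-articleBody" is an assumed class name; adjust if the markup differs.
    html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
    soup = BeautifulSoup(html, "html.parser")
    body = soup.find("div", class_="ArticleBody-articleBody")
    if body is None:
        return ""  # markup differs, or the page isn't a standard article
    return "\n".join(p.get_text(strip=True) for p in body.find_all("p"))

You could call fetch_article on each cn:liveURL value before appending the row, at the cost of one extra request per result.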

Related

Incorporating pagination scraping into my script

url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=electronics"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
names = soup.find_all("div", class_="s-item__title")
prices = soup.find_all("span", class_="s-item__price")
shippings = soup.find_all("span", class_="s-item__shipping s-item__logisticsCost"
for name,price,shipping in zip(names,prices,shippings):
print(name.text, price.text, shipping.text)
Right now, this script works perfectly. It prints everything that needs to be printed.
But... I want to be able to go to the next page and scrape everything off of there as well.
The class for the next page is "pagination__next icon-link"
I'm not sure how I would go about it.
Just iterate over the pages by changing the pagination value in the URL query (eBay's _pgn starts at 1):
base_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=electronics&_pgn='

for i in range(1, pages_count + 1):
    url = base_url + f'{i}'
    # your code...
    response = requests.get(url)
Because the number of result pages differs per category, I advise you to check the pagination object on each request, read the last page number, and substitute it into the request.
Take the last available page number from the current page:
ol = soup.find("ol", class_="pagination__items")
lis = ol.find_all("li")
print(f"Last available page number on current page: {lis[-1].text}")
In order to collect all the information from all pages, you can use a while loop that dynamically paginates through them.
The loop runs until a stop condition is met; in our case it ends when there is no next page, which is detected via the ".pagination__next" CSS selector.
There is also a URL parameter responsible for pagination, _pgn, which is incremented by 1 to select the next page:
if soup.select_one(".pagination__next"):
    params['_pgn'] += 1
else:
    break
See the full code below:
from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

params = {
    "_nkw": "electronics",  # search query
    "_pgn": 1               # page number
}

data = []

while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]

        data.append({
            "title": title,
            "price": price,
            "link": link
        })

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Nintendo DSi XL Japan Import Console & USB Charger - Pick Your Color TESTED",
    "price": "$69.99",
    "link": "https://www.ebay.com/itm/165773301243?hash=item2698dbd5fb:g:HFcAAOSwTdNhnqy~&amdata=enc%3AAQAHAAAA4MXRmWPDY6vBlTlYLy%2BEQPsi1HJM%2BFzt2TWJ%2BjCbK6Q2mreLV7ZpKmZOvU%2FMGqxY2oQZ91aPaHW%2FS%2BRCUW3zUKWDIDoN2ITF3ooZptkWCkd8x%2FIOIaR7t2rSYDHwQEFUD7N6wdnY%2Bh6SpljeSkCPkoKi%2FDCpU0YLOO3mpuLVjgO8GQYKhrlXG59BDDw8IyaayjRVdWyjh534fuIRToSqFrki97dJMVXE0LNE%2BtPmJN96WbYIlqmo4%2B278nkNigJHI8djvwHMmqYUBQhQLN2ScD%2FLnApPlMJXirqegMet0DZQ%7Ctkp%3ABk9SR7K0tsSSYQ"
  },
  {
    "title": "Anbernic RG351P White, Samsung 64 GB SD Card AmberElec & Case",
    "price": "$89.99",
    "link": "https://www.ebay.com/itm/144690079314?hash=item21b0336652:g:8qwAAOSw93ZjO6n~&amdata=enc%3AAQAHAAAAoNGQWvtymUdp2cEYaKyfTAzWm0oZvBODZsm2oeHl3s%2F6jF9k3nAIpsQkpiZBFI657Cg53X9zAgExAxQAfmev0Bgh7%2FjEtC5FU8O5%2FfoQ3tp8XYtyKdoRy%2FwdebmsGKD%2FIKvW1lWzCNN%2FpSAUDLrPgPN9%2Fs8igeU7jqAT4NFn3FU7W4%2BoFV%2B2gNOj8nhxYlm3HZ6vm21T4P3IAA4KXJZhW2E%3D%7Ctkp%3ABk9SR7K0tsSSYQ"
  },
  {
    "title": "New ListingWhite wii console ONLY Tested Working",
    "price": "$24.99",
    "link": "https://www.ebay.com/itm/385243730852?hash=item59b250d3a4:g:t3YAAOSwZBBjctqi&amdata=enc%3AAQAHAAAAoH9I%2BSQlJpKebgObGE7Idppe2cewzEiV0SdZ6pEu0sVpIJK5%2F3q15ygTFAdPRElY232LwDKIMXjkIwag1FUN76geBg2vCnPfd3x8BAHzXn%2B1u5zF9cBITLCuawKTYnfUeCYMavO4cBmpnsrvUOSokvnTacfB078MF95%2FH1sUQH%2BfIjDtPzFoFTJrTtKLINRlXZ9edD%2BVW%2FB2TLYZ%2FHMAHkE%3D%7Ctkp%3ABk9SR7K0tsSSYQ"
  },
  # ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code that paginates through all pages:
from serpapi import EbaySearch
from urllib.parse import (parse_qsl, urlsplit)
import os, json

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "electronics",            # search query
}

search = EbaySearch(params)  # where data extraction happens

page_num = 0
data = []

while True:
    results = search.get_dict()  # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        link = organic_result.get("link")
        price = organic_result.get("price")

        data.append({
            "price": price,
            "link": link
        })

    page_num += 1
    print(page_num)

    # look for the next page data (_pgn) and stop on the last page:
    next_url = results.get("serpapi_pagination", {}).get("next")
    if next_url:
        next_page_query_dict = dict(parse_qsl(urlsplit(next_url).query))
        current_page = results["serpapi_pagination"]["current"]  # 1, 2, 3...
        # e.g. if current_page == 20 and the next _pgn == 20, we're done:
        if int(current_page) == int(next_page_query_dict["_pgn"]):
            break
        # update next page data
        search.params_dict.update(next_page_query_dict)
    else:
        break

print(json.dumps(data, indent=2))
Output:
[
  {
    "price": {
      "raw": "$169.00",
      "extracted": 169.0
    },
    "link": "https://www.ebay.com/itm/113356737439?hash=item1a64968b9f:g:4qoAAOSwQypdKgT6&amdata=enc%3AAQAHAAAA4N8GJRRCbG8WIU7%2BzjrvsRMMmKaTEnA0l7Nz9nOWUUSin3gZ5Ho41Fc4A2%2FFLtlLzbb5UuTtU5s3Qo7Ky%2FWB%2FTEuDKBhFldxMZUzVoixZXII6T1CTtgG5YFJWs0Zj8QldjdM9PwBFuiLNJbsRzG38k7v1rJdg4QGzVUOauPxH0kiANtefqiBhnYHWZ0RfMqwh4S%2BbQ59JYQWSZjAefL61WYyNwkfSdrfcq%2BW2B7b%2BR8QEfynka5CE6g7YPpoWWp4Bk3IOvd4CZxAzTpgvOPoMMKPy0VCW1gPJDG4R2CsfDEv%7Ctkp%3ABk9SR56IpsWSYQ"
  },
  {
    "price": {
      "raw": "$239.00",
      "extracted": 239.0
    },
    "link": "https://www.ebay.com/itm/115600879000?hash=item1aea596d98:g:F3YAAOSwsXxjbuYn&amdata=enc%3AAQAHAAAA4LuAhrdA4ahkT85Gf15%2FtEH9GBe%2B0qlDZfEt4p9O0YPmJZVPyq%2Fkuz%2FV86SF3%2B7SYY%2BlK04XQtCyS3NGyNi03GurFWx2dYwoKFUj2G7YsLw%2BalUKmdiv5bC3jJaRTnXuBOJGPXQxw2IwTHcvZ%2Fu8T7tEnYF5ih3HGMg69vCVZdVHqRa%2FYehvk14wVwj3OwBTVrNM8dq7keGeoLKUdYDHCMAH6Y4je4mTR6PX4pWFS6S7lJ8Zrk5YhyHQInwWYXwkclgaWadC4%2BLwOzUjcKepXl5mDnxUXe6pPcccYL3u8g4O%7Ctkp%3ABk9SR56IpsWSYQ"
  },
  # ...
]

How do I use the "next" url for a POST request?

So how does Python call the "next" page in a POST request?
I know what I need to do, but I am not sure how to implement it; all the examples on YouTube use a page number rather than a cursor, and being a semi-beginner I am a bit confused.
This is my code so far:
def main_request(headers, url1, params):
    response = requests.post(url1, headers=headers, json=params, verify=False)
    jsonData = response.json()
    has_next_key = False
    nextKey = ""
    if "next_key" in jsonData:
        has_next_key = True
        nextKey = jsonData["next_key"]
    while has_next_key:
        data = {"limit_count": 500, "limit_size": 10000, "curr_key": nextKey}
        params = {"data": json.dumps(data, separators=(",", ":"))}
        req = requests.post(url1, headers=headers, json=params, verify=False)  # this should request the third page and so on...
        jsonData = req.json()
        if "next_key" in jsonData:
            nextKey = jsonData["next_key"]
            print(nextKey)  # this returns "3321", which is the value for "next_key" on the second page
        else:
            has_next_key = False  # no next_key, stop the loop
This is the value it brings back at the end of each request:
{
    "data": [],
    "metadata": {},
    "links": [
        {
            "href": "https://us.api.insight.rapid7.com:443/vm/v4/integration/assets?page=0&size=2",
            "rel": "first"
        },
        {
            "href": "https://us.api.insight.rapid7.com:443/vm/v4/integration/assets?page=0&size=2",
            "rel": "self"
        },
        {
            "href": "https://us.api.insight.rapid7.com:443/vm/v4/integration/assets?page=1&size=2&cursor=1542252837:::_S:::12474375-34a7-40a3-9821-28db0b5cc90e-default-asset-10",
            "rel": "next"
        },
        {
            "href": "https://us.api.insight.rapid7.com:443/vm/v4/integration/assets?page=1097&size=2",
            "rel": "last"
        }
    ]
}
According to Rapid7 support, I need to use the cursor value.
Given your jsonData as input, you can use this code to get the next url and assign it to url1:
for item in jsonData.get("links", []):
    if item["rel"] == "next":
        url1 = item["href"]
        break
This only finds the first such url. If you need all of them, I'd recommend collecting the urls in a list. For example:
links = [item["href"] for item in jsonData.get("links", []) if item["rel"] == "next"]
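To answer the broader question of how to keep calling the "next" page: wrap that lookup in a loop that follows the cursor URL until the API stops returning a "next" link. A sketch under the assumption that every page returns the same links structure and that headers/params already carry your credentials (the question's endpoint uses POST with a JSON body, so that is kept here):

import requests

def fetch_all_pages(url, headers, params):
    # Follow rel="next" hrefs (the cursor is embedded in the href's query string)
    # until the response no longer contains one.
    results = []
    while url:
        jsonData = requests.post(url, headers=headers, json=params, verify=False).json()
        results.extend(jsonData.get("data", []))
        url = None  # stop unless another "next" link turns up
        for item in jsonData.get("links", []):
            if item["rel"] == "next":
                url = item["href"]
                break
    return results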

Why does this print an empty list and dictionary?

import requests
from bs4 import BeautifulSoup
import json

data = {
    0: {
        0: "title",
        1: "dates",
        2: "city/state",
        3: "country"
    },
    1: {
        0: "event",
        1: "reps",
        2: "prize"
    },
    2: {
        0: "results"
    }
}

url = "https://mms.kcbs.us/members/evr_search.php?org_id=KCBA"
response = requests.get(url).text
soup = BeautifulSoup(response, features='lxml')

all_data = []
for element in soup.find_all('div', class_="row"):
    event = {}
    for i, col in enumerate(element.find_all('div', class_='col-md-4')):
        for j, item in enumerate(col.strings):
            event[data[i][j]] = item
    all_data.append(event)

print(json.dumps(all_data, indent=4))
Here's a link to the website: https://mms.kcbs.us/members/evr_search.php?org_id=KCBA
I'm unsure why nothing gets added to the list and dictionaries.
The data you see is loaded from an external URL via JavaScript. To simulate that Ajax request, you can use the following example:
import json
import requests
from bs4 import BeautifulSoup

api_url = "https://mms.kcbs.us/members/evr_search_ol_json.php"

params = {
    "otype": "TEXT",
    "evr_map_type": "2",
    "org_id": "KCBA",
    "evr_begin": "6/16/2022",
    "evr_end": "7/16/2022",
    "evr_address": "",
    "evr_radius": "50",
    "evr_type": "269",
    "evr_openings": "0",
    "evr_region": "",
    "evr_region_type": "1",
    "evr_judge": "0",
    "evr_keyword": "",
    "evr_rep_name": "",
}

soup = BeautifulSoup(
    requests.get(api_url, params=params).content, "html.parser"
)

data = {
    0: {0: "title", 1: "dates", 2: "city/state", 3: "country"},
    1: {0: "event", 1: "reps", 2: "prize"},
    2: {0: "results"},
}

all_data = []
for element in soup.find_all("div", class_="row"):
    event = {}
    for i, col in enumerate(element.find_all("div", class_="col-md-4")):
        for j, item in enumerate(col.strings):
            event[data[i][j]] = item
    all_data.append(event)

print(json.dumps(all_data, indent=4))
Prints:
[
    {
        "title": "Frisco BBQ Challenge",
        "dates": "6/16/2022 - 6/18/2022",
        "city/state": "Frisco, CO 80443",
        "country": "UNITED STATES",
        "event": "STATE CHAMPIONSHIP",
        "reps": "Reps: BUNNY TUTTLE, RICH TUTTLE, MICHAEL WINTER",
        "prize": "Prize Money: $13,050.00",
        "results": "Results Not In"
    },
    {
        "title": "York County BBQ Festival",
        "dates": "6/17/2022 - 6/18/2022",
        "city/state": "Delta, PA 17314",
        "country": "UNITED STATES",
        "event": "STATE CHAMPIONSHIP",
        "reps": "Reps: ANGELA MCKEE, ROBERT MCKEE, LOUISE WEIDNER",
        "prize": "Prize Money: $5,500.00",
        "results": "Results Not In"
    },
    ...and so on.

How to merge two APIs by id in Python?

I want to create a dictionary using 2 APIs. Can you guide me?
url = 'https://test.com/api/v1/'
tags = []
result = []

response = requests.get(url)
results = json.loads(response.text)
for data in results['results']:
    # second API
    url = 'https://test.com/api/v1/' + data['tags_id']
    response = requests.get(url)
    results = json.loads(response.text)
    for data in results['tags']:
        tags.append(data['title'])
Result of the first API:
results: [
    {
        "title": "subject1",
        "tags_id": "86111ae6",
    },
    {
        "title": "subject2",
        "tags_id": "86ae6",
    }]
Expected result:
results: [
    {
        "title": "subject1",
        "tags": ['a','b'],
    },
    {
        "title": "subject2",
        "tags": ['c','d','f'],
    }]
second API
"tags": [
{
"title": 'a',
},
{
"title": 'b',
},
]
Since each "title" from the first API has exactly one "tags_id", I think what you want is something like:
import json
import requests

url = 'https://test.com/api/v1/'
response = requests.get(url)
results = json.loads(response.text)

output = list()
for d in results['results']:
    response = requests.get(f'https://test.com/api/v1/{d["tags_id"]}/show_tags')
    result = json.loads(response.text)
    output.append({"title": d["title"],
                   "tags": [t["title"] for t in result["tags"]]})

Glassdoor API Not Printing Custom Response

I have the following problem when I try to print something from this API. I'm trying to set it up so I can access different headers, then print specific items from them. But instead, when I try to print soup, it gives me the entire API response in JSON format.
import requests, json, urlparse, urllib2
from BeautifulSoup import BeautifulSoup

url = "apiofsomesort"

# Create Dict based on JSON response; request the URL and parse the JSON
# response = requests.get(url)
# response.raise_for_status()  # raise exception if invalid response
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url, headers=hdr)
response = urllib2.urlopen(req)
soup = BeautifulSoup(response)
print soup
When it prints it looks like the below:
{
    "success": true,
    "status": "OK",
    "jsessionid": "0541E6136E5A2D5B2A1DF1F0BFF66D03",
    "response": {
        "attributionURL": "http://www.glassdoor.com/Reviews/airbnb-reviews-SRCH_KE0,6.htm",
        "currentPageNumber": 1,
        "totalNumberOfPages": 1,
        "totalRecordCount": 1,
        "employers": [{
            "id": 391850,
            "name": "Airbnb",
            "website": "www.airbnb.com",
            "isEEP": true,
            "exactMatch": true,
            "industry": "Hotels, Motels, & Resorts",
            "numberOfRatings": 416,
            "squareLogo": "https://media.glassdoor.com/sqll/391850/airbnb-squarelogo-1459271200583.png",
            "overallRating": 4.3,
            "ratingDescription": "Very Satisfied",
            "cultureAndValuesRating": "4.4",
            "seniorLeadershipRating": "4.0",
            "compensationAndBenefitsRating": "4.3",
            "careerOpportunitiesRating": "4.1",
            "workLifeBalanceRating": "3.9",
            "recommendToFriendRating": "0.9",
            "sectorId": 10025,
            "sectorName": "Travel & Tourism",
            "industryId": 200140,
            "industryName": "Hotels, Motels, & Resorts",
            "featuredReview": {
                "attributionURL": "http://www.glassdoor.com/Reviews/Employee-Review-Airbnb-RVW12111314.htm",
                "id": 12111314,
                "currentJob": false,
                "reviewDateTime": "2016-09-28 16:44:00.083",
                "jobTitle": "Employee",
                "location": "",
                "headline": "An amazing place to work!",
                "pros": "Wonderful people and great culture. Airbnb really strives to make you feel at home as an employee, and everyone is genuinely excited about the company mission.",
                "cons": "The limitations of Rails 3 and the company infrastructure make developing difficult sometimes.",
                "overall": 5,
                "overallNumeric": 5
            },
            "ceo": {
                "name": "Brian Chesky",
                "title": "CEO & Co-Founder",
                "numberOfRatings": 306,
                "pctApprove": 95,
                "pctDisapprove": 5,
                "image": {
                    "src": "https://media.glassdoor.com/people/sqll/391850/airbnb-brian-chesky.png",
                    "height": 200,
                    "width": 200
                }
            }
        }]
    }
}
I want to print out specific items like the employers' "name", "industry", etc.
You can load the JSON response into a dict then look for the values you want like you would in any other dict.
I took your data and saved it in an external JSON file to do a test since I don't have access to the API. This worked for me.
import json

# Load JSON from external file
with open(r'C:\Temp\json\data.json') as json_file:
    data = json.load(json_file)

# Print the values
print 'Name:', data['response']['employers'][0]['name']
print 'Industry:', data['response']['employers'][0]['industry']
Since you're getting your data from an API something like this should work.
import json
import urllib2

url = "apiofsomesort"

# Load JSON from API
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url, headers=hdr)
response = urllib2.urlopen(req)
data = json.loads(response.read())

# Print the values
print 'Name:', data['response']['employers'][0]['name']
print 'Industry:', data['response']['employers'][0]['industry']
import json, urllib2

url = "http..."
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(url, headers=hdr)
response = urllib2.urlopen(req)
data = json.loads(response.read())

# Print the values
print 'numberOfRatings:', data['response']['employers'][0]['numberOfRatings']
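Note that the snippets above are Python 2 (urllib2 and print statements). For Python 3, a sketch of the same lookup using requests, keeping the question's placeholder URL:

import requests

url = "apiofsomesort"  # placeholder URL from the question
headers = {'User-Agent': 'Mozilla/5.0'}

data = requests.get(url, headers=headers).json()

# Print the values
print('Name:', data['response']['employers'][0]['name'])
print('Industry:', data['response']['employers'][0]['industry'])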
