How to make a fetch request and get data from the response object in Python?

Can someone help me please? I am making a fetch request and trying to get data from the response using Python, but I am getting an error.
import requests
import json
response_API = requests.get('https://newsapi.org/v2/top-headlines?q=sports&country=ru&pageSize=10&apiKey=befce9fd53c04bb695e30568399296c0')
print(response_API.status_code)
data=response_API.text
parse_json=json.loads(response_API)
active_case=parse_json['name']
print('Total results',active_case)
I'm trying to get the name from the following array:
{"status":"ok","totalResults":2,"articles":[{"source":{"id":null,"**name**":"Sports.ru"},"author":"Валерий Левкин","title":"Леброн Джеймс получил «Золотую малину» за худшую актерскую работу - Sports.ru","description":"В США названы обладатели антинаграды «Золотая малина» по итогам 2021 года.","url":"https://www.sports.ru/basketball/1107870293-lebron-dzhejms-poluchil-zolotuyu-malinu-za-xudshuyu-akterskuyu-rabotu.html%22,%22urlToImage%22:%22https://www.sports.ru/dynamic_images/news/110/787/029/3/share/bd571e.jpg%22,%22publishedAt%22:%222022-03-26T13:03:00Z%22,%22content":null}]}
I get an error and the value is not returned.

The newsapi URL returns JSON content with a list of articles, where each article has this structure:
{
    "source": {
        "id": null,
        "name": "Sports.ru"
    },
    "author": "...",
    "title": "... - Sports.ru",
    "description": "...",
    "url": "https://www.sports.ru/basketball/1107870293-lebron-dzhejms-poluchil-zolotuyu-malinu-za-xudshuyu-akterskuyu-rabotu.html",
    "urlToImage": "https://www.sports.ru/dynamic_images/news/110/787/029/3/share/bd571e.jpg",
    "publishedAt": "2022-03-26T13:03:00Z",
    "content": null
}
To extract a particular element, such as the description, from each article, try this:
import requests
import json
response = requests.get('https://newsapi.org/v2/top-headlines?q=sports&country=ru&pageSize=10&apiKey=befce9fd53c04bb695e30568399296c0')
print(response.status_code)
response.encoding = "utf-8"
data = response.json()
# to get the name from source of each article
print([article["source"].get("name") for article in data["articles"]])
# to get the descriptions from each article
# where desc will be a list of descriptions
desc = [article["description"] for article in data["articles"]]
print(desc)
Output:
200
['Sports.ru', 'Sports.ru']
['description1', 'description2']

You need to follow the nesting of objects:
First get the key 'articles',
then get the first element of the list,
then get the key 'source',
and finally get the key 'name'.
You can do this all in a single chained lookup, as in the sketch below.
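For example, a minimal sketch of that single chained lookup (assuming parse_json holds the parsed response, e.g. parse_json = json.loads(response_API.text), and that the response contains at least one article):
# drill down through the nesting in one expression: articles -> first article -> source -> name
name = parse_json['articles'][0]['source']['name']
print(name)  # -> 'Sports.ru'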

A slightly different method, but with the same result, using your original technique as the basis: you get a JSON string, convert that string to a Python object, then pull out the bit that you want.
import requests
import json
response_API = requests.get('https://newsapi.org/v2/top-headlines?q=sports&country=ru&pageSize=10&apiKey=befce9fd53c04bb695e30568399296c0')
print(response_API.status_code)
# this is a json string
data=response_API.text
# convert string to json
parse_json=json.loads(data)
print('here is the json....')
print(parse_json)
# get an element from the json
active_case=parse_json['articles'][0]
# print the result
print('here is the active case...')
print(active_case)
This is the result, from which you can extract whatever you like:
{'source': {'id': None, 'name': 'Sports.ru'}, 'author': 'Валерий Левкин', 'title': 'Леброн Джеймс получил «Золотую малину» за худшую актерскую работу - Sports.ru', 'description': 'В США названы обладатели антинаграды «Золотая малина» по итогам 2021 года.', 'url': 'https://www.sports.ru/basketball/1107870293-lebron-dzhejms-poluchil-zolotuyu-malinu-za-xudshuyu-akterskuyu-rabotu.html', 'urlToImage': 'https://www.sports.ru/dynamic_images/news/110/787/029/3/share/bd571e.jpg', 'publishedAt': '2022-03-26T13:03:00Z', 'content': None}, {'source': {'id': None, 'name': 'Sports.ru'}, 'author': 'Андрей Карнаухов', 'title': 'Овечкин забил 771-й гол в НХЛ. До Хоу – 30 шайб - Sports.ru', 'description': 'Капитан\xa0«Вашингтона»\xa0Александр Овечкин\xa0забросил\xa0шайбу, а также забил победный буллит в серии в матче с «Баффало» (4:3 Б) и был признан третьей звездой.', 'url': 'https://www.sports.ru/hockey/1107860736-ovechkin-zabil-771-j-gol-v-nxl-do-xou-30-shajb.html', 'urlToImage': 'https://www.sports.ru/dynamic_images/news/110/786/073/6/share/c9cb18.jpg', 'publishedAt': '2022-03-26T01:56:15Z', 'content': None}
Here the result is a simple dict.
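If you only need the source name, a minimal follow-up sketch (assuming active_case holds the first article as above):
# navigate the nested dict down to the source name
print(active_case['source']['name'])  # -> 'Sports.ru'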

Related

How to fix this error during scraping using BeautifulSoup?

I am trying to do web scraping using the BeautifulSoup and requests Python libraries. I want to filter the news titles from the Hacker News website, but it's showing an error when I run it.
import requests
from bs4 import BeautifulSoup
res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titleline a')
subtext = soup.select('.subtext')
def create_custom_hn(links, subtext):
    hn = []
    for index, item in enumerate(links):
        title = links[index].getText()
        href = links[index].get('href', None)
        votes = subtext[index].select('.score')
        if len(votes):
            points = int(votes[0].getText().replace(' points', ''))
            print(points)
            hn.append({'title': title, 'href': href})
    return hn
print(create_custom_hn(links, subtext))
The error says
votes = subtext[index].select('.score')
~~~~~~~^^^^^^^
IndexError: list index out of range
Here is a fixed version of the code from the question:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://news.ycombinator.com/news")
soup = BeautifulSoup(res.text, "html.parser")
links = soup.select(".titleline > a")
def create_custom_hn(links):
    hn = []
    for link in links:
        title = link.getText()
        href = link.get("href", None)
        votes = link.find_next(class_="score")
        points = int(votes.getText().replace(" points", ""))
        hn.append({"title": title, "href": href, "points": points})
    return hn
print(create_custom_hn(links))
Prints:
[
    {
        "title": "Urllib3 in 2022",
        "href": "https://sethmlarson.dev/urllib3-in-2022",
        "points": 97,
    },
    {
        "title": "First public release of Pushup: a new compiler for making web apps in Go",
        "href": "https://github.com/adhocteam/pushup",
        "points": 18,
    },
    {
        "title": "Intelligence – A good collection of great OSINT Resources",
        "href": "https://github.com/ARPSyndicate/awesome-intelligence",
        "points": 113,
    },
    {
        "title": "Microsoft is preparing to add ChatGPT to Bing",
        "href": "https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter",
        "points": 760,
    },
...and so on.
Try to select your elements more specifically; your selection soup.select('.titleline a') includes more elements (60) than you probably want to select (30):
[<a href="https://sethmlarson.dev/urllib3-in-2022">Urllib3 in 2022</a>,
<a href="from?site=sethmlarson.dev"><span class="sitestr">sethmlarson.dev</span></a>,...]
I would also recommend iterating the elements in another way, so that you are able to handle missing values.
Example
import requests
from bs4 import BeautifulSoup
res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text)
data = []
for e in soup.select('tr.athing'):
    data.append({
        'title': e.select_one('.titleline a').get_text(),
        'url': e.select_one('.titleline a').get('href'),
        'votes': e.find_next(class_='subtext').text.split()[0]
    })
print(data)
Output
[{'title': 'Urllib3 in 2022', 'url': 'https://sethmlarson.dev/urllib3-in-2022', 'votes': '93'}, {'title': 'First public release of Pushup: a new compiler for making web apps in Go', 'url': 'https://github.com/adhocteam/pushup', 'votes': '16'}, {'title': 'Intelligence – A good collection of great OSINT Resources', 'url': 'https://github.com/ARPSyndicate/awesome-intelligence', 'votes': '109'}, {'title': 'Microsoft is preparing to add ChatGPT to Bing', 'url': 'https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter', 'votes': '755'}, {'title': 'Juan Tamariz, the godfather of close-up card magic', 'url': 'https://www.nytimes.com/2023/01/02/magazine/juan-tamariz-magic.html', 'votes': '31'}, {'title': 'The Expanding Dark Forest and Generative AI', 'url': 'https://maggieappleton.com/ai-dark-forest', 'votes': '223'}, {'title': 'Irreconcilable differences between local and distributed computing (1994)', 'url': 'https://scholar.harvard.edu/waldo/publications/note-distributed-computing', 'votes': '29'},...]

Beautiful soup - html parser returns dots instead of string visible on web

I'm trying to get the number of actors from: https://apify.com/store which is under the following HTML:
<div class="ActorStore-statusNbHits">
<span class="ActorStore-statusNbHitsNumber">895</span>results</div>
When I send a GET request and parse the response with BeautifulSoup using:
r = requests.get(base_url)
soup = BeautifulSoup(r.text, "html.parser")
return soup.find("span", class_="ActorStore-statusNbHitsNumber").text
I get three dots ... instead of the number 895;
the element comes back as <span class="ActorStore-statusNbHitsNumber">...</span>.
How can I get the number?
If you inspect the network calls in your browser (press F12) and filter by XHR, you'll see that the data is loaded dynamically by sending a POST request.
You can mimic that request by sending the correct JSON data. There's no need for BeautifulSoup; you can use only the requests module.
Here is a complete working example:
import requests
data = {
    "query": "",
    "page": 0,
    "hitsPerPage": 24,
    "restrictSearchableAttributes": [],
    "attributesToHighlight": [],
    "attributesToRetrieve": [
        "title",
        "name",
        "username",
        "userFullName",
        "stats",
        "description",
        "pictureUrl",
        "userPictureUrl",
        "notice",
        "currentPricingInfo",
    ],
}
response = requests.post(
    "https://ow0o5i3qo7-dsn.algolia.net/1/indexes/prod_PUBLIC_STORE/query?x-algolia-agent=Algolia%20for%20JavaScript%20(4.12.1)%3B%20Browser%20(lite)&x-algolia-api-key=0ecccd09f50396a4dbbe5dbfb17f4525&x-algolia-application-id=OW0O5I3QO7",
    json=data,
)
print(response.json()["nbHits"])
Output:
895
To view all the JSON data in order to access the key/value pairs, you can use:
from pprint import pprint
pprint(response.json(), indent=4)
Partial output:
{ 'exhaustiveNbHits': True,
'exhaustiveTypo': True,
'hits': [ { 'currentPricingInfo': None,
'description': 'Crawls arbitrary websites using the Chrome '
'browser and extracts data from pages using '
'a provided JavaScript code. The actor '
'supports both recursive crawling and lists '
'of URLs and automatically manages '
'concurrency for maximum performance. This '
"is Apify's basic tool for web crawling and "
'scraping.',
'name': 'web-scraper',
'objectID': 'moJRLRc85AitArpNN',
'pictureUrl': 'https://apify-image-uploads-prod.s3.amazonaws.com/moJRLRc85AitArpNN/Zn8vbWTika7anCQMn-SD-02-02.png',
'stats': { 'lastRunStartedAt': '2022-03-06T21:57:00.831Z',
'totalBuilds': 104,
'totalMetamorphs': 102660,
'totalRuns': 68036112,
'totalUsers': 23492,
'totalUsers30Days': 1726,
'totalUsers7Days': 964,
'totalUsers90Days': 3205},
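As a follow-up, a minimal sketch of pulling a couple of fields out of each hit on the current page (assuming every hit contains the "name" and "stats" keys shown in the partial output above):
# collect the actor name and total run count for every hit returned by the query
summary = [(hit["name"], hit["stats"]["totalRuns"]) for hit in response.json()["hits"]]
print(summary)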

Parse server payload with a few keys absent

I have a rather basic bit of code. Basically, it sends an API request to a locally hosted server, which returns a JSON string. I'm taking that string and cracking it apart, then I take what I need from it, build a dictionary, and export it as an XML file with an .nfo extension.
The issue is that sometimes there are missing bits in the source data; season is missing fairly frequently, for example, and that breaks the data mapping. I need a way to handle that. For some things I may want to exclude the data, and for others I need a sane default value.
#!/bin/env python
import os
import requests
import re
import json
import dicttoxml
import xml.dom.minidom
from xml.dom.minidom import parseString
# Grab Shoko Auth Key
apiheaders = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
}
apidata = '{"user": "Default", "pass": "", "device": "CLI"}'
r = requests.post('http://192.168.254.100:8111/api/auth',
                  headers=apiheaders, data=apidata)
key = json.loads(r.text)['apikey']
# Grabbing Episode Data
EpisodeHeaders = {
    'accept': 'text/plain',
    'apikey': key
}
EpisodeParams = (
    ('filename', "FILE HERE"),
    ('pic', '1'),
)
fileinfo = requests.get(
    'http://192.168.254.100:8111/api/ep/getbyfilename',
    headers=EpisodeHeaders, params=EpisodeParams)
# Mapping Data from Shoko to Jellyfin NFO
string = json.loads(fileinfo.text)
print(string)
eplot = json.loads(fileinfo.text)['summary']
etitle = json.loads(fileinfo.text)['name']
eyear = json.loads(fileinfo.text)['year']
episode = json.loads(fileinfo.text)['epnumber']
season = json.loads(fileinfo.text)['season']
aid = json.loads(fileinfo.text)['aid']
seasonnum = season.split('x')
# Create Dictionary From Mapped Data
show = {
    "plot": eplot,
    "title": etitle,
    "year": eyear,
    "episode": episode,
    "season": seasonnum[0],
}
Here is some example output when the code crashes
{'type': 'ep', 'eptype': 'Credits', 'epnumber': 1, 'aid': 10713, 'eid': 167848,
'id': 95272, 'name': 'Opening', 'summary': 'Episode Overview not Available',
'year': '2014', 'air': '2014-11-23', 'rating': '10.00', 'votes': '1',
'art': {'fanart': [{'url': '/api/v2/image/support/plex_404.png'}],
'thumb': [{'url': '/api/v2/image/support/plex_404.png'}]}}
Traceback (most recent call last):
File "/home/fletcher/Documents/Shoko-Jellyfin-NFO/Xml3.py", line 48, in <module>
season = json.loads(fileinfo.text)['season']
KeyError: 'season'
The solution, based on what Mahori suggested, worked perfectly:
eplot = json.loads(fileinfo.text).get('summary', None)
etitle = json.loads(fileinfo.text).get('name', None)
eyear = json.loads(fileinfo.text).get('year', None)
episode = json.loads(fileinfo.text).get('epnumber', None)
season = json.loads(fileinfo.text).get('season', '1x1')
aid = json.loads(fileinfo.text).get('aid', None)
This is a fairly common scenario in web development, where you cannot always assume the other party will send all keys.
The standard way to get around this is to use get instead of a plain keyed lookup.
season = json.loads(fileinfo.text).get('season', None)
#you can change None to any default value here
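As a side note, a minimal sketch that parses the response once and then applies .get for every field (the info name is just an illustration; the '1x1' default mirrors the accepted snippet above):
# parse the JSON a single time instead of calling json.loads repeatedly
info = json.loads(fileinfo.text)
eplot = info.get('summary')          # None when the key is absent
etitle = info.get('name')
season = info.get('season', '1x1')   # sane default when 'season' is missing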

eBay Finding API Date Filtering

I am trying to return a list of completed items in a given category using the eBay API. My code seems to be working; however, the results seem very limited (about 100). I was assuming there would be some limitation on how far back the API would go, but even just a few days should return thousands of results for this category. Am I missing something in the code, or is this just a limitation of the eBay API? I did make sure I was using production and not the sandbox.
I have since realized that there are multiple pages to my query, up to the 100-item / 100-page maximum. I am now running into issues with the date filtering. I see the filter reference material on the site, but I am still not getting the result I expect. In the updated query I am trying to pull only items completed yesterday, but when I run it I am getting items from today. Is there a better way to input the date filters?
from ebaysdk.finding import Connection as finding
from bs4 import BeautifulSoup
import os
import csv
api = finding(appid=<my appid>, config_file=None)
response = api.execute(
    'findCompletedItems', {
        'categoryId': '214',
        'keywords': 'prizm',
        'endTimeFrom': '2020-02-03T00:00:00.000Z',
        'endTimeTo': '2020-02-04T00:00:00.000Z',
        'paginationInput': {
            'entriesPerPage': '100',
            'pageNumber': '1'
        },
        'sortOrder': 'EndTimeSoonest'
    }
)
soup = BeautifulSoup(response.content , 'lxml')
totalitems = int(soup.find('totalentries').text)
items = soup.find_all('item')
for item in response.reply.searchResult.item:
    print(item.itemId)
    print(item.listingInfo.endTime)
I finally figured this out. I needed to add additional code for the item filters. The working code is below.
from ebaysdk.finding import Connection as finding
from bs4 import BeautifulSoup
import os
import csv
api = finding(appid=<my appid>, config_file=None)
response = api.execute(
    'findCompletedItems', {
        'categoryId': '214',
        'keywords': 'prizm',
        'itemFilter': [
            {'name': 'EndTimeFrom', 'value': '2020-02-03T00:00:00.000Z'},
            {'name': 'EndTimeTo', 'value': '2020-02-04T00:00:00.000Z'}
            # {'name': 'MinPrice', 'value': '200', 'paramName': 'Currency', 'paramValue': 'GBP'},
            # {'name': 'MaxPrice', 'value': '400', 'paramName': 'Currency', 'paramValue': 'GBP'}
        ],
        'paginationInput': {
            'entriesPerPage': '100',
            'pageNumber': '100'
        },
        'sortOrder': 'EndTimeSoonest'
    }
)
soup = BeautifulSoup(response.content , 'lxml')
totalitems = int(soup.find('totalentries').text)
items = soup.find_all('item')
for item in response.reply.searchResult.item:
    print(item.itemId)
    print(item.listingInfo.endTime)
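Since the question mentions paging through up to the 100-page maximum, here is a minimal sketch of looping over the result pages with the same request (the 'totalpages' lookup is an assumption modelled on the 'totalentries' lookup above, not a verified field name):
# walk through the result pages one by one and collect every item
all_items = []
page = 1
while True:
    response = api.execute('findCompletedItems', {
        'categoryId': '214',
        'keywords': 'prizm',
        'itemFilter': [
            {'name': 'EndTimeFrom', 'value': '2020-02-03T00:00:00.000Z'},
            {'name': 'EndTimeTo', 'value': '2020-02-04T00:00:00.000Z'}
        ],
        'paginationInput': {'entriesPerPage': '100', 'pageNumber': str(page)},
        'sortOrder': 'EndTimeSoonest'
    })
    soup = BeautifulSoup(response.content, 'lxml')
    all_items.extend(soup.find_all('item'))
    totalpages = int(soup.find('totalpages').text)  # assumed tag, like 'totalentries' above
    if page >= min(totalpages, 100):  # the Finding API caps results at 100 pages
        break
    page += 1
print(len(all_items))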

How to get the ID of a comment reply when it has the same ID and class as the parent comment (scraping data with Selenium)

I have a structure:
[
    # If it is a comment (parent comment)
    {
        'commentParentId': '',
        'parentId': '',
        'posted': '28/02/2019',
        'author': {
            'id': '125379',
            'name': 'david',
        },
        'content': 'i need help'
    },
    # If it is a comment reply
    {
        'commentParentId': 'abcdedf',
        'parentId': '253654',
        'posted': '28/02/2019',
        'author': {
            'id': '458216',
            'name': 'david',
        },
        'content': 'i need help'
    },
    ...
]
I want to scrape comments and comment replies.
If it is a comment (a parent comment), commentParentId and parentId are null.
Otherwise it is a comment reply, and commentParentId and parentId take the ID of the comment that was replied to.
I am scraping comments using Selenium, like this:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from selenium import webdriver
# Execute Web link
url = "https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong"
driver_path = ('F:/chromedriver.exe')
browser = webdriver.Chrome(executable_path=driver_path)
browser.get(url)
confirm_write = input("Input ok to scrape data: ")
# I want to load all comments (click 'Xem Thêm' ["See more"]) before the data is scraped
if confirm_write == 'ok':
    getID = browser.find_element_by_css_selector("div[class='media-body-replies']")
    getChildID = getID.find_elements_by_css_selector('data-comment-id')
    # Get ID
    for childID in getChildID:
        print(childID.get_attribute('data-comment-id'))
But my code is not working.
A comment and a comment reply have the same class and the same id; the only difference between them is that the reply sits inside the class 'media-body-replies'. But when I use that class, it is not working.
If I use getChildID = browser.find_elements_by_css_selector('data-comment-id'),
I get all IDs of both parent comments and replies (with similar content), and I cannot separate the comments from the comment replies.
Thank you
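For reference, a minimal sketch of one way the two kinds of elements could be separated, assuming the class and attribute names mentioned in the question (the selectors are unverified assumptions about the page structure, not a tested answer):
# elements carrying a data-comment-id inside a reply container are treated as replies
reply_elems = browser.find_elements_by_css_selector("div.media-body-replies [data-comment-id]")
reply_ids = {e.get_attribute('data-comment-id') for e in reply_elems}
# every element carrying a data-comment-id, replies included
all_elems = browser.find_elements_by_css_selector("[data-comment-id]")
parent_ids = [e.get_attribute('data-comment-id')
              for e in all_elems
              if e.get_attribute('data-comment-id') not in reply_ids]
print('parents:', parent_ids)
print('replies:', sorted(reply_ids))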
