Scraping with requests - Python

What is wrong in my code? I am trying to get the same content as at https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG, but the result is different from what I want.
import requests
from bs4 import BeautifulSoup

s = requests.Session()
s.headers.update({"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'})
response = s.get('https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG')
soup = BeautifulSoup(response.text, 'lxml')
print(soup.prettify())

You can use requests and pass params in to get JSON for the train info and prices. I haven't parsed out all the info, as this is just to show that it is possible. I parse out the train ids so that I can make the subsequent requests for price info, which is linked to the train info by id.
import requests

url = 'https://koleo.pl/pl/connections/?'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    # Cookie and X-CSRF-Token are session-specific; grab fresh values from your browser
    'Cookie': '_ga=GA1.2.2048035736.1553000429; _gid=GA1.2.600745193.1553000429; _gat=1; _koleo_session=bkN4dWRrZGx0UnkyZ3hjMWpFNGhiS1I3TzhQMGNyWitvZlZ0QVRUVVVtWUFPMUwxL0hJYWJyYnlGTUdHYXNuL1N6QlhHMHlRZFM3eFZFcjRuK3ZubllmMjdSaU5CMWRBSTFOc1JRc2lDUGV0Y2NtTjRzbzZEd0laZWI1bjJoK1UrYnc5NWNzZzNJdXVtUlpnVE15QnRnPT0tLTc1YzV1Q2xoRHF4VFpWWTdWZDJXUnc9PQ%3D%3D--3b5fe9bb7b0ce5960bc5bd6a00bf405df87f8bd4',
    'Host': 'koleo.pl',
    'Referer': 'https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'X-CSRF-Token': 'heag3Y5/fh0hyOfgdmSGJBmdJR3Perle2vJI0VjB81KClATLsJxFAO4SO9bY6Ag8h6IkpFieW1mtZbD4mga7ZQ==',
    'X-Requested-With': 'XMLHttpRequest'
}
params = {
    'v': 'a0dec240d8d016fbfca9b552898aba9c38fc19d5',
    'query[date]': '19-03-2019 10:00:00',
    'query[start_station]': 'krakow-glowny',
    'query[end_station]': 'radom',
    # a dict cannot hold the same key twice, so pass both brand ids as a list
    'query[brand_ids][]': ['29', '28'],
    'query[only_direct]': 'false',
    'query[only_purchasable]': 'false'
}

with requests.Session() as s:
    data = s.get(url, params=params, headers=headers).json()
    print(data)
    priceUrl = 'https://koleo.pl/pl/prices/{}?v=a0dec240d8d016fbfca9b552898aba9c38fc19d5'
    for item in data['connections']:
        r = s.get(priceUrl.format(item['id'])).json()
        print(r)
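Note that the hard-coded Cookie and X-CSRF-Token values expire quickly. A minimal sketch of refreshing them at the start of the session, assuming koleo.pl exposes the token in a <meta name="csrf-token"> tag the way Rails-style apps usually do (verify this in the page source):
import requests
from bs4 import BeautifulSoup

scheduleUrl = 'https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0'
    # loading the schedule page seeds the session's cookie jar
    page = s.get(scheduleUrl)
    soup = BeautifulSoup(page.text, 'lxml')
    # assumption: the CSRF token sits in a meta tag; check the actual page
    meta = soup.find('meta', {'name': 'csrf-token'})
    if meta is not None:
        s.headers['X-CSRF-Token'] = meta['content']
After this, the same session can issue the /pl/connections/ request without copying values out of the browser.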

You have to use Selenium in order to get that dynamically generated content, and then you can parse the HTML with BeautifulSoup. For example, I've parsed out the dates:
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG')
soup = BeautifulSoup(driver.page_source, 'lxml')
for div in soup.findAll("div", {"class": 'date custom-panel'}):
    date = div.findAll("div", {"class": 'row'})[0].string.strip()
    print(date)
Output:
wtorek, 19 marca
środa, 20 marca
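If page_source is read before the JavaScript finishes rendering, those panels can be missing. A small sketch with an explicit wait, assuming the same date custom-panel class marks the rendered rows:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG')
# block for up to 10 seconds until at least one date panel exists
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.date.custom-panel'))
)
# driver.page_source is now safe to hand to BeautifulSoup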

Selecting links within a div tag using beautiful soup

I am trying to run the following code
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
params = {
    'q': 'Machine learning',
    'hl': 'en'
}
html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
soup = BeautifulSoup(html, 'lxml')
for result in soup.select('.gs_r.gs_or.gs_scl'):
    profiles = result.select('.gs_a a')['href']
The following output (error) is being shown
"TypeError: list indices must be integers or slices, not str"
What is it I am doing wrong?
The following is tested and works:
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
params = {
    'q': 'Machine learning',
    'hl': 'en'
}
html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
soup = bs(html, 'lxml')
for result in soup.select('.gs_r.gs_or.gs_scl'):
    profiles = result.select('.gs_a a')
    for p in profiles:
        print(p.get('href'))
Result in terminal:
/citations?user=rSVIHasAAAAJ&hl=en&oi=sra
/citations?user=MnfzuPYAAAAJ&hl=en&oi=sra
/citations?user=09kJn28AAAAJ&hl=en&oi=sra
/citations?user=yxUduqMAAAAJ&hl=en&oi=sra
/citations?user=MnfzuPYAAAAJ&hl=en&oi=sra
/citations?user=9Vdfc2sAAAAJ&hl=en&oi=sra
/citations?user=lXYKgiYAAAAJ&hl=en&oi=sra
/citations?user=xzss3t0AAAAJ&hl=en&oi=sra
/citations?user=BFdcm_gAAAAJ&hl=en&oi=sra
/citations?user=okf5bmQAAAAJ&hl=en&oi=sra
/citations?user=09kJn28AAAAJ&hl=en&oi=sra
In your code, you were trying to obtain the href attribute from a list: soup.select returns a list of matching elements, while soup.select_one returns a single element. See the BeautifulSoup documentation for details.
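To make the difference concrete, a short sketch reusing a result element from the loop above:
links = result.select('.gs_a a')        # a list of Tag objects
print(links[0].get('href'))             # index into the list before reading the attribute

link = result.select_one('.gs_a a')     # a single Tag, or None if nothing matches
if link is not None:
    print(link.get('href'))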

Supreme adding to cart with Python Requests

So I'm currently trying to cart an item on Supreme through requests. After posting the carting request I don't get any errors, just a response saying it didn't work.
# imports
import requests
from bs4 import BeautifulSoup

# constants
baseurl = "https://www.supremenewyork.com/"
product_category = ""
size = ["Medium"]
product_keywords = ["Supreme®/The North Face® Steep Tech Fleece Pant"]
product_style = ["Brown"]

# functions
def carting(url):
    session = requests.Session()
    r = session.get(baseurl + url)
    soup = BeautifulSoup(r.text, "html.parser")
    name = soup.find("h1", {"itemprop": "name"}).text
    style = soup.find("p", {"itemprop": "model"}).text
    for keyword in product_keywords:
        if keyword in name:
            for keyword in product_style:
                if keyword in style:
                    print("Product Found! Adding to cart...")
                    form = soup.find("form", {"id": "cart-add"})
                    payload = {
                        "utf8": "✓",
                        "authenticity_token": form.find("input", {"name": "authenticity_token"})["value"],
                        "style": form.find("input", {"name": "style"})["value"],
                        "size": "92001",  # form.find("select", {"id": "size"})["value"] -- need to rework getting size through keyword
                        "qty": "1"
                    }
                    headers = {
                        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
                        "origin": "https://www.supremenewyork.com",
                        "referer": baseurl + url,
                        "path": form["action"],
                        'Host': 'www.supremenewyork.com',
                        'Accept': 'application/json',
                        'Proxy-Connection': 'keep-alive',
                        'X-Requested-With': 'XMLHttpRequest',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'en-us',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Origin': 'http://www.supremenewyork.com',
                        'Connection': 'keep-alive',
                    }
                    response = session.post(baseurl + form["action"], data=payload)
                    print(response.text)
    return session

carting("/shop/pants/mj1czv0pa/jcyp91a8w")
The answer I get printed is
Product Found! Adding to cart...
{"cart":[],"success":false}
I wondered if I just have to gamble on which headers to include, since maybe the site expects certain headers, or even all of the headers a browser would send.
Help appreciated!
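One thing that stands out in the code above: the headers dict is built but never passed to the POST, so headers the site may check (X-Requested-With, Content-Type, and so on) are never sent. A sketch of the fix; no guarantee this alone satisfies Supreme's checks:
# pass the prepared headers along with the form payload
response = session.post(baseurl + form["action"], data=payload, headers=headers)
print(response.text)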

How to change JSON data output to table format

import requests
from pprint import pprint
import pandas as pd

baseurl = "https://www.nseindia.com/"
url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'accept-language': 'en,gu;q=0.9,hi;q=0.8',
    'accept-encoding': 'gzip, deflate, br'
}

session = requests.Session()
# hit the home page first so the session picks up the cookies NSE requires
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
print(res.json())
I tried df = pd.DataFrame(res.json()) but couldn't get the data in table format. How do I do that? Also, how can I select only a few particular columns in the output instead of all of them?
Try this:
import json
import codecs

df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
And to select specific columns, you can use:
mini_df = df[['symbol', 'latestOI', 'prevOI', 'changeInOI', 'avgInOI']]
>>> print(mini_df)
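The codecs round-trip is there to strip the UTF-8 byte-order mark some endpoints prepend to the body; if the response is clean JSON, a shorter form should also work (a sketch, untested against this API):
# works when the body has no BOM; otherwise use the codecs version above
df = pd.DataFrame(res.json()['data'])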

Parsing a table with Pandas

I am trying to parse the table from https://alreits.com/screener
I have tried this:
import requests
import pandas as pd

main_url = 'https://alreits.com/screener'
r = requests.get(main_url)
df_list = pd.read_html(r.text)
df = df_list[0]
print(df)
but pandas can't find the table.
I have also tried BeautifulSoup4, but it didn't seem to give better results.
This is the selector: #__next > div.MuiContainer-root.MuiContainer-maxWidthLg > div.MuiBox-root.jss9.Card__CardContainer-feksr6-0.fpbzHQ.ScreenerTable__CardContainer-sc-1c5wxgl-0.GRrTj > div > table > tbody
This is the full xPath: /html/body/div/div[2]/div[2]/div/table/tbody
I am trying to get the stock symbol (under Name), sector, score, and market cap. The other data would be nice to have but is not necessary.
Thank You!
I found a JSON url in the dev tools. This is an easier way to extract the table than using Selenium. Use a POST request to fetch the data.
import requests
import pandas as pd

headers = {
    'authority': 'api.alreits.com:8080',
    'sec-ch-ua': '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'sec-ch-ua-platform': '"Windows"',
    'content-type': 'application/json',
    'accept': '*/*',
    'origin': 'https://alreits.com',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://alreits.com/',
    'accept-language': 'en-US,en;q=0.9',
}
params = (
    ('page', '0'),
    ('size', '500'),
    # a list value sends the sort parameter three times
    ('sort', ['marketCap,desc', 'score,desc', 'ffoGrowth,desc']),
)
data = '{"filters":[]}'

response = requests.post('https://api.alreits.com:8080/api/reits/screener', headers=headers, params=params, data=data)
df = pd.DataFrame(response.json())
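From there you can keep just the columns asked about. The field names below are guesses based on the sort keys in the request (marketCap, score), so check response.json() for the exact spelling:
# hypothetical field names -- verify them against the actual JSON payload
wanted = ['symbol', 'sector', 'score', 'marketCap']
print(df[[c for c in wanted if c in df.columns]])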
The code below will return the data you are looking for.
import requests
import pprint
import json

headers = {
    'content-type': 'application/json',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
r = requests.post(
    'https://api.alreits.com:8080/api/reits/screener?page=0&size=500&sort=marketCap,desc&sort=score,desc&sort=ffoGrowth,desc',
    headers=headers, data=json.dumps({'filters': []}))
if r.status_code == 200:
    pprint.pprint(r.json())
    # now you have the data - do what you want with it
else:
    print(r.status_code)

Trouble with web scraping

I would like to get the name of the hotel. Usually I have no problem with this kind of scraping, but here it doesn't work and I don't understand why.
Here's my script:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'https://www.espncricinfo.com/',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
hotel = soup.find('h2', class_='hp__hotel-name').text
print(hotel)
Here's the error :
Traceback (most recent call last):
File "test_booking_info_supp.py", line 75, in <module>
hotel = soup.find('h2', class_ = 'hp__hotel-name').text
AttributeError: 'NoneType' object has no attribute 'text'
I cannot understand why I obtained None. (The page HTML was attached as a screenshot, with the link to the website in the image.)
You can try the tag's id attribute to get the text value:
import requests
from bs4 import BeautifulSoup

url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
hotel = soup.find("h2", attrs={"id": "hp_hotel_name"})
print(hotel.text.strip("\n").split("\n")[1])
Output:
'Hyatt Regency Paris Etoile'
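Equivalently, since the heading has a stable id, a CSS selector keeps it to one line; a sketch using the same hp_hotel_name id the answer relies on:
hotel = soup.select_one('#hp_hotel_name')   # returns None if nothing matches
if hotel is not None:
    # the separator keeps nested text chunks distinct, mirroring the split above
    print(hotel.get_text('\n', strip=True).split('\n')[-1])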
