I am accessing the URL https://streeteasy.com/sales/all, which does not serve the page unless a Cookie header is set. I have no idea how this cookie value is generated. I highly doubt the value is fixed, so I guess I can't use a hard-coded Cookie value either.
Code below:
import requests
from bs4 import BeautifulSoup
headers = {
'authority': 'streeteasy.com',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'referer': 'https://streeteasy.com/sales/all',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,ur;q=0.8',
'cookie': 'D_SID=103.228.157.1:Bl5GGXCWIxq4AopS1Hkr7nkveq1nlhWXlD3PMrssGpU; _se_t=0944dfa5-bfb4-4085-812e-fa54d44acc54; google_one_tap=0; D_IID=AFB68ACC-B276-36C0-8718-13AB09A55E51; D_UID=23BA0A61-D0DF-383D-88A9-8CF65634135F; D_ZID=C0263FA4-96BF-3071-8318-56839798C38D; D_ZUID=C2322D79-7BDB-3E32-8620-059B1D352789; D_HID=CE522333-8B7B-3D76-B45A-731EB750DF4D; last_search_tab=sales; se%3Asearch%3Asales%3Astate=%7C%7C%7C%7C; streeteasy_site=nyc; se_rs=123%2C1029856%2C123%2C1172313%2C2815; se%3Asearch%3Ashared%3Astate=102%7C%7C%7C%7Cfalse; anon_searcher_stage=initial; se_login_trigger=4; se%3Abig_banner%3Asearch=%7B%22123%22%3A2%7D; se%3Abig_banner%3Ashown=true; se_lsa=2019-07-08+04%3A01%3A30+-0400; _ses=BAh7DEkiD3Nlc3Npb25faWQGOgZFVEkiJWRiODVjZTA1NmYzMzZkMzZiYmU4YTk4Yjk5YmU5ZTBlBjsAVEkiEG5ld192aXNpdG9yBjsARlRJIhFsYXN0X3NlY3Rpb24GOwBGSSIKc2FsZXMGOwBUSSIQX2NzcmZfdG9rZW4GOwBGSSIxbTM5eGRPUVhLeGYrQU1jcjZIdi81ajVFWmYzQWFSQmhxZThNcG92cWxVdz0GOwBGSSIIcGlzBjsARmkUSSIOdXNlcl9kYXRhBjsARnsQOhBzYWxlc19vcmRlckkiD3ByaWNlX2Rlc2MGOwBUOhJyZW50YWxzX29yZGVySSIPcHJpY2VfZGVzYwY7AFQ6EGluX2NvbnRyYWN0RjoNaGlkZV9tYXBGOhJzaG93X2xpc3RpbmdzRjoSbW9ydGdhZ2VfdGVybWkjOhltb3J0Z2FnZV9kb3ducGF5bWVudGkZOiFtb3J0Z2FnZV9kb3ducGF5bWVudF9kb2xsYXJzaQJQwzoSbW9ydGdhZ2VfcmF0ZWYJNC4wNToTbGlzdGluZ3Nfb3JkZXJJIhBsaXN0ZWRfZGVzYwY7AFQ6EHNlYXJjaF92aWV3SSIMZGV0YWlscwY7AFRJIhBsYXN0X3NlYXJjaAY7AEZpAXs%3D--d869dc53b8165c9f9e77233e78c568f610994ba7',
}
session = requests.Session()
response = session.get('https://streeteasy.com/for-sale/downtown', headers=headers, timeout=20)
if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('h3 > a')
    print(links)
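A hedged first step (a sketch, not a fix for JavaScript-generated cookies): let a requests.Session collect whatever cookies the server sets on an initial GET instead of hard-coding the Cookie header. The D_* values above look like they come from a client-side bot-detection script, so this may still be blocked.
import requests

# Sketch: collect server-set cookies with a Session, then reuse them on
# the next request. This cannot reproduce cookies that are generated
# client-side by JavaScript (the D_* values above appear to be of that kind).
session = requests.Session()
ua = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/75.0.3770.100 Safari/537.36'}
session.get('https://streeteasy.com/sales/all', headers=ua, timeout=20)
response = session.get('https://streeteasy.com/for-sale/downtown', headers=ua, timeout=20)
print(response.status_code)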
I am having trouble reading data from a URL with BeautifulSoup.
This is my code:
url1 = "https://www.barstoolsportsbook.com/events/1018282280"
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
html = requests.get(url1, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
data = soup.findAll('div',attrs={"class":"section"})
print(data)
#for x in data:
# print(x.find('p').text)
When I print(data), I get back an empty list []. What could be the reason for this? I would like to avoid using selenium for this task if possible.
This is the HTML for what I'm trying to grab
<div data-v-50e01018="" data-v-2a52296d="" class="section"><p data-v-5f665d29="" data-v-50e01018="" class="header strongbody2">HOT TIPS</p><p data-v-5f665d29="" data-v-50e01018="" class="tip body2"> The Mets have led after 3 innings in seven of their last nine night games against NL East Division opponents that held a losing record. </p><p data-v-5f665d29="" data-v-50e01018="" class="tip body2"> The 'Inning 1 UNDER 0.5 runs' market has hit in each of the Marlins' last nine games against NL East opponents. </p></div>
You can likely get what you want with this request:
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:103.0) Gecko/20100101 Firefox/103.0',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'en-US,en;q=0.5',
'Authorization': 'Basic MTI2OWJlMjItNDI2My01MTI1LWJlNzMtMDZmMjlmMmZjNWM3Omk5Zm9jajRJQkZwMUJjVUc0NGt2S2ZpWEpremVKZVpZ',
'Origin': 'https://www.barstoolsportsbook.com',
'DNT': '1',
'Connection': 'keep-alive',
'Referer': 'https://www.barstoolsportsbook.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'cross-site',
}
response = requests.get('https://api.isportgenius.com.au/preview/1018282280', headers=headers)
print(response.json())
You should browse the network tab to see where the rest of the data is coming from, or use a webdriver.
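If the API route doesn't pan out, the webdriver option can look like this minimal sketch (assumes selenium is installed and a chromedriver is on PATH):
from bs4 import BeautifulSoup
from selenium import webdriver

# Let a real browser execute the page's JavaScript, then hand the
# rendered HTML to BeautifulSoup.
driver = webdriver.Chrome()
driver.get('https://www.barstoolsportsbook.com/events/1018282280')
soup = BeautifulSoup(driver.page_source, 'lxml')
print(soup.find_all('div', attrs={'class': 'section'}))
driver.quit()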
I know you can loop through page numbers in a URL, but is it possible to loop through pages in a payload? I would guess you need Beautiful Soup? At the end of the payload, there is this code.
page=1&start=0&limit=250
Currently, I am just getting page 1 with 250 lines. I need to loop through the code and retrieve page=2&start=251&limit=250 and the subsequent 11 pages. Could anyone show me how to do this?
Working Code for first 250 Lines
import requests
import pandas as pd
def stock_data(stock_info):
    data = pd.DataFrame(stock_info)
    data = data.iloc[:, 4:]
    data.to_csv("data.csv", index=False)
url = "https://www.stockrover.com/stock_infos/grid?_dc=1644876887410"
payload = "state=%7B%22sortInfo%22%3A%7B%7D%2C%22columns%22%3A%5B76%2C77%2C50%2C80%2C547%2C13%2C21%2C771%5D%2C%22view%22%3A313%2C%22priorPrimaryColumn%22%3A170%2C%22filterData%22%3A%5B%5D%2C%22name%22%3A%22New%201%22%2C%22cType%22%3A%22Screener%22%2C%22cNode%22%3A%22s_42%22%2C%22cIsFolder%22%3Afalse%2C%22gridSelection%22%3A%22ANDE%22%2C%22lastActive%22%3A1396898415%2C%22primaryColumn%22%3A76%2C%22folderDisabledParams%22%3A%7B%22filterData%22%3A%5B%5D%7D%2C%22mainGridDateRange%22%3A%22ytd%22%2C%22groupState%22%3Anull%2C%22moversGridDateRange%22%3A%221_day%22%2C%22peersGridDateRange%22%3A%221_day%22%2C%22lastGridSelections%22%3A%5B%22ANDE%22%5D%2C%22lastQuantNode%22%3A%5B%22s_42%22%2C%22s_42%22%5D%2C%22includeQuotesInTable%22%3Afalse%2C%22includeAllQuotesLastValue%22%3Afalse%2C%22markets%22%3A%7B%22panel%22%3A%22summary%22%7D%2C%22researchPanel%22%3A%22comparisonPanel%22%2C%22recentSearchTickers%22%3A%5B%22SPY%22%2C%22AMZN%22%2C%22AAPL%22%2C%22s_32%22%2C%22%5ENDX%22%2C%22AXP%22%2C%22XOM%22%2C%22AFL%22%2C%22%5EDJX%22%2C%22AIT%22%2C%22ADVC%22%5D%2C%22quotesBoxTickers%22%3A%5B%22AMZN%22%2C%22AAPL%22%2C%22SPY%22%5D%2C%22checkedQuotesBoxTickers%22%3A%5B%22AMZN%22%2C%22AAPL%22%2C%22SPY%22%5D%2C%22dashboard%22%3A%7B%22buttonRef%22%3A%22272%22%7D%2C%22tickerSelectedFeeds%22%3A%5B%22Benzinga%20News%22%2C%22Yahoo%20News%22%5D%2C%22marketSelectedFeeds%22%3A%5B%22Google%20News%22%2C%22Stock%20Market%20News%20-%20Investing.com%22%5D%2C%22bondsSelectedFeeds%22%3A%5B%22Bonds%20Strategy%20-%20Investing.com%22%5D%2C%22commoditiesSelectedFeeds%22%3A%5B%22Commodities%20%26%20Futures%20News%20-%20Investing.com%22%2C%22Commodities%20Fundamental%20Analysis%20-%20Investing.com%22%2C%22Commodities%20Strategy%20Analysis%20-%20Investing.com%22%5D%2C%22stocksSelectedFeeds%22%3A%5B%22CNNMoney%20News%22%2C%22Google%20News%22%2C%22Seeking%20Alpha%20Top%20Stories%22%5D%2C%22etfsSelectedFeeds%22%3A%5B%22Economy%20News%20-%20Investing.com%22%2C%22ETF%20Analysis%20-%20Investing.com%22%2C%22Investing%20Ideas%20-%20Investing.com%22%5D%2C%22topPanel%22%3A%22researchPanel%22%2C%22maxRecordsNode%22%3Afalse%2C%22version%22%3A7%2C%22lastGridSelectionsRaw%22%3A%5B%22ANDE%22%5D%2C%22lastSelectionScreeners%22%3A%22s_42%22%2C%22quotesDisabled%22%3Atrue%2C%22lastSelectionPortfolios%22%3A%22p_2%22%2C%22comparisonPanels%22%3A%7B%22Portfolio%22%3A%22p_2%22%2C%22Index%22%3A%22%5EDJX%22%2C%22Watchlist%22%3A%22Watchlists%22%2C%22Screener%22%3A%22s_42%22%7D%2C%22lastSelectionWatchlists%22%3A%22w_26%22%2C%22indicesSelectedFeeds%22%3A%5B%22Google%20News%22%2C%22Yahoo%20News%22%5D%2C%22newsActive%22%3A%22tickerNews%22%2C%22recentSearchMetrics%22%3A%5B%22Price%22%2C%22EPS%22%2C%22Sales%22%5D%2C%22editPanel%22%3A%22positionsPanel%22%2C%22newsType%22%3A%22marketNews%22%2C%22tableColumns%22%3A%5B%22ticker%22%2C%22rank%22%2C%22score_rank%22%2C%22filter_score%22%2C%22company%22%2C%22cash%22%2C%22currentassets%22%2C%22netppe%22%2C%22intangibles%22%2C%22totalassets%22%2C%22currentliabilities%22%2C%22longtermdebt%22%2C%22totaldebt%22%2C%22totalliabilities%22%2C%22equity%22%2C%22tangiblebookvalue%22%2C%22cash_short_term_p%22%2C%22net_ppe_p%22%2C%22intangibles_p%22%5D%2C%22last_save%22%3A1644837064%2C%22panels%22%3A%7B%22collapsed%22%3A%7B%22chp%22%3Atrue%2C%22ip%22%3Atrue%2C%22mp%22%3Afalse%2C%22qp%22%3Afalse%2C%22conp%22%3Atrue%2C%22fsp%22%3Afalse%7D%2C%22viewportWidth%22%3A%221920%22%2C%22viewportHeight%22%3A%221069%22%2C%22chartPanelHeight%22%3A483%2C%22controlPanelWidth%22%3A296%2C%22insightPanelWidth%22%3A%22485%22%2C%22quoteBoxHeight%22%3A200%2C%22nav
igationPanelWidth%22%3A277%7D%7D&updateMarket=true&page=1&start=0&limit=250"
headers = {
'authority': 'www.stockrover.com',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
'x-csrf-token': 'fAeVScD26lby5MQf5YFI5p3snudo3E+rw0TL0h1W3j/vcjsIMvgxAF5Z9DkMjjCU4trT/b4EV0VCCPvmms5VIw==',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'accept': 'application/json',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-platform': '"Windows"',
'origin': 'https://www.stockrover.com',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.stockrover.com/research/all/313/s_42/ANDE',
'accept-language': 'en-US,en;q=0.9',
'cookie': 'remember_me_pref=0; user_name=test11964; plan=3; premiumBraintreeKey=MIIBCgKCAQEAzM4LJfrNnBOgRFB1dDJkmqTFCWT2Y%2BksOydD8xDH4R033WUzxbffMZb%2B3dqEyQvOVjLcwFIHByDc4Xwej7enas2E%2FVRyh7Cvyadn7M5zQeRyLcI9Ys5KCozMwxJPc0x76FlXPwiAo1Qlz3RcLb9wGHBag2R51FuTie%2BhVDCgzWajqDCREzRhi%2Fqlt3D%2FxXNo%2FiwJlpOUr%2Fx1QnkkILxgKlq1dD7KJ767O5ojYKXsO%2BV2Bfu7sSD3djsOxQJ1%2FRbaDm2E96EDkWhhOeOpPndQ6IuSl4NmnJg%2Fcq6f8csW8M3Ys%2BMZPFkdxPC4%2FfRM1XC9o76PjpVNBIO%2ByJEELKZedwIDAQAB; lr=1644876886; _Ruby2_session=OEs3djBGVmtrSXhSWjFibnp2ck9vem8vd1VmQ00wUkZCVWlaWmYzOFBQQUJyemc0MFNjMlVmTGRUeFpNSTFIRGliVitnb2M1L2JNcCt5SnQxN2xtZDV5M20waEd0elh3aUU3R0k3YnJiVy9BcUhObGpBUU54c2tQRE9RZHBZWk8wa0NBQXJub2tHU3pHZmUvM3dRNGZvVStsT0ZEbUNEa3ZyVDkxdDA5S1B5d3VZWlZNUERxL01VNlYzdHhBci9uSnduOGxodk8rSDJLaUJIWTVyM25GL3o4RHYva3pGeDdIY1NJODV4WkV4MnRiR2RoNHZDUktPWSsxaElPdXNEU0tSaCtKL1o0K2FsSE1rcVI2TkgxOFo0MktSL1JRWjgzRllBZGFhMjg4bitpTUJzQjR6Nk5OZzhzUFMzVGRpVFNZOUxrTnhWNTB2K0dYNXdDcFFXZnpZdlZleFR2cWY5cmJqaDNiY0JhWVJJT0s1TDEvOHU0UTV6NW5uTjcwZjFzdHpxeFg0cVQ3NTRob2xTMlA2ZDhCQT09LS1LOEljN3laVTlBVUpsVVo3c093Y2NnPT0%3D--b963330daa985315420ea5893f1cfa3e3a54c9d5; _Ruby2_session=UmE1NVo0cjJ1YmR3RGlQUis0V3JVWUprR0xtbzVaV0k2NTNhUmFUU2VsNVB4WFN6M0Y5M0xIVmpMdGdBTEgzMDZwb3JZNGpJRmFkdjRyWG9iY3NGT1pENTRaaXdZZit3b3FCZzdvMTVxZ3p2RGpUUUtyTGpOcjhZcXVxS0FIcElBTnhtZDlJQ3g3d1o2bUF3Tzl4NnIyZ2ZHYmMyb09vVnMwaGU5SEsvTWRwaHdFeUNGUDJZUDY4eXEwRlcyYzFqVVh2VzVjcEhDSU9jOUw4NmVJcmd6aHEwVUUya25Yb0Y5d21hY3g5VTdxbi81dkdiZ2Qrc0YrdEZtNWU5c3ZlcHFRSXVqcXdBVEM1RnV5bFo3N3hQNzg4Q1lJWTRtakp2MHJyN3gvUEtvN0h2R3lTZEFqZkwrVFlDRlk2czZoaDBQcXltUjdQbUNiTXJWMW42WnlPdUc2ZGxzUXRzY3JuYTN1V1VIMkVvdHVpeE40N0l6SjVLelJhUGh6aXBrZ3B2V1gySG1YRUVLbGZGYzQzRzR3QjJwSTFieDZRQ242QTdKNlNEWVFIQ3lTWHJPQzUzZ1lKVzl5S0FoZWxHQ0xnTy0tODZDejdtSmFDek5GeUVMMnJweTJtUT09--a126f3bcc5b8af0a5a824e6b674d55f1fe9ee12e; lr=1644876939'
}
response = requests.request("POST", url, headers=headers, data=payload)
stock_info = response.json()['stock_infos']
stock_data(stock_info)
Here's how you do it in a loop; this works (I've tried it here). First change the end of the payload string from page=1&start=0&limit=250 to page={}&start={}&limit={} so that .format() has placeholders to fill:
for page in range(3):
    pld = payload.format(page + 1, page * 250, 250)
    response = requests.request("POST", url, headers=headers, data=pld)
    stock_info = response.json()['stock_infos']
    stock_data(stock_info)
You will, of course, need to modify your code so that stock_data doesn't overwrite the CSV file every time. You can either append to one big dataframe, or append to the CSV file.
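For instance, a sketch of the append-to-CSV option, writing the header only when the file doesn't exist yet:
import os

import pandas as pd

def stock_data(stock_info):
    # Append each page to data.csv; only write the header on the first
    # pass, so repeated calls don't duplicate it or overwrite earlier rows.
    data = pd.DataFrame(stock_info).iloc[:, 4:]
    data.to_csv("data.csv", mode="a", index=False,
                header=not os.path.exists("data.csv"))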
My request:
# python 3.7.3
import requests
from requests import Session
session = Session()
session.head('https://www.basspro.com/shop/en/blazer-brass-handgun-ammo')
cookies = requests.utils.cookiejar_from_dict(requests.utils.dict_from_cookiejar(session.cookies))
response = session.post(
url='https://www.basspro.com/shop/BPSGetInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'itemId': '3074457345616736949',
'isGunFlag': 'false',
},
cookies=cookies,
headers={
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'content-length': '72',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.basspro.com',
'referer': 'https://www.basspro.com/shop/en/blazer-brass-handgun-ammo',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.92 Safari/537.36 Vivaldi/2.9.1705.38',
'x-requested-with': 'XMLHttpRequest',
},
)
print(response.text)
Output:
<input type="hidden" class="relativeToAbsolute" value="true" />
/*
{
"onlineInventory": {
"status": "Status Not Available",
"image": "widget_product_info/outofstock_icon.svg",
"altText": "Status Not Available",
"isDropShip": false,
"availableDate":""
},
"inStoreInventory": {
"stores": [],
"checkStoreText": "Check Store Availability",
"isInStoreInventory": true,
"isPickupInventory": false
}
}
*/
My output when inspecting and running the same AJAX request via browser:
/*
{
"onlineInventory": {
"status": "Backordered",
"image": "widget_product_info/backordered_icon.svg",
"altText": "Backordered",
"isDropShip": false,
"quantity": 0,
"availableDate":"May 1-8"
},
"inStoreInventory": {
"stores": [{
id: '715839555',
name: '83',
gunRestricted: 'false',
dsName: 'TX - Round Rock',
status: 'Unavailable',
statusText: 'Out of Stock',
image: 'widget_product_info/outofstock_icon.svg',
altText: 'Out of Stock',
availableDate: '',
availableQuantity: '',
availableQuantityDisplay: 'false',
cityState: 'Round Rock, TX',
ISPavailableDate: '',
ISPavailableQuantity: '',
pickupTime: 'by 2:00pm',
offerISPOnBPS: 'Yes',
offerISPOnCAB: 'No'}],
"checkStoreText": "Change Store",
"isInStoreInventory": true,
"isPickupInventory": true
}
}
*/
I tried assigning cookies this way as well:
url = "https://www.basspro.com/shop/en/blazer-brass-handgun-ammo"
r = requests.get(url)
cookies = r.cookies
# fails to pass the right cookie
If I instead copy the cookie verbatim from an inspected GET request at https://www.basspro.com/shop/en/blazer-brass-handgun-ammo and put that into the POST headers, it works. How do I get cookies to work properly programmatically?
EDIT:
Here's my attempt at just using Session() for cookies:
# python 3.7.3
import requests
from requests import Session
session = Session()
session.get("https://www.basspro.com/shop/en/blazer-brass-handgun-ammo")
# session.head('https://www.basspro.com/shop/en/blazer-brass-handgun-ammo')
response = session.post(
url='https://www.basspro.com/shop/BPSGetInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'itemId': '3074457345616736949',
'isGunFlag': 'false',
},
headers={
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'content-length': '72',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.basspro.com',
'referer': 'https://www.basspro.com/shop/en/blazer-brass-handgun-ammo',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.92 Safari/537.36 Vivaldi/2.9.1705.38',
'x-requested-with': 'XMLHttpRequest',
},
)
print(response.text)
I get the same result as before ("status": "Status Not Available", etc.)
Here's my attempt at the second solution:
# python 3.7.3
import requests
from requests import Session
url = "https://www.basspro.com/shop/en/blazer-brass-handgun-ammo"
r = requests.get(url)
cookies = r.cookies # the type is RequestsCookieJar
response = requests.post(
url='https://www.basspro.com/shop/BPSGetInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'itemId': '3074457345616736949',
'isGunFlag': 'false',
},
cookies=cookies,
headers={
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'content-length': '72',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.basspro.com',
'referer': 'https://www.basspro.com/shop/en/blazer-brass-handgun-ammo',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.92 Safari/537.36 Vivaldi/2.9.1705.38',
'x-requested-with': 'XMLHttpRequest',
},
)
print(response.text)
Again, I get the same result as before. What am I doing wrong?
Can you try it like this?
session = Session()
session.get("https://www.basspro.com/shop/en/blazer-brass-handgun-ammo")
Then make all the following calls with session.xxx and do not pass a cookies parameter to them.
Another way I have tested:
cookies = r.cookies # the type is RequestsCookieJar
requests.post(.... cookies=cookies...)
At last, I tested that the following works. Please compare it carefully with your version; the key differences are that the initial GET sends a realistic user-agent and the POST goes to BPSGetOnlineInventoryStatusByIDView with catalogId, storeId, and langId in the form data:
from requests import Session
session = Session()
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
r1 = session.get("https://www.basspro.com/shop/en/blazer-brass-handgun-ammo",headers={'user-agent': agent})
response = session.post(
url='https://www.basspro.com/shop/BPSGetOnlineInventoryStatusByIDView',
data={
'productId': '3074457345616736172',
'catalogId': '3074457345616676768',
'storeId': '715838534',
'langId':-1
},
headers={
'user-agent': agent,
'x-requested-with': 'XMLHttpRequest',
},
cookies=r1.cookies
)
print(response.text)
I'm trying to get some links which include a specific class, so I wrote this code:
from bs4 import BeautifulSoup
import requests
def getPages(requestedURLS):
    _buff = []
    for url in requestedURLS:
        try:
            _buff.append(requests.get(url))
        except requests.exceptions.RequestException as err:
            print(err)
    return _buff
def getProductList(pages):
    links = []
    for page in pages:
        content = BeautifulSoup(page.content, 'html.parser')
        links.extend(content.find_all("a", class_="sresult lvresult clearfix li shic"))
        print(links)
def main():
    pageLinks = [
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=2&_skc=200&rt=nc",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=3&_skc=400&rt=nc"
    ]
    # Results in: <div id="Results" class="results "> <ul id="ListViewInner">
    pages = getPages(pageLinks)
    productList = getProductList(pages)

if __name__ == '__main__':
    main()
You can check the links; there are lots of anchors with this class, but the output is empty, as you can see below:
C:\Users\projects\getMarketData\venv\Scripts\python.exe C:/Users/projects/getMarketData/getData.py
[]
[]
[]
Process finished with exit code 0
What is wrong?
Using appropriate headers should do the trick:
url = 'https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0'
request_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Host': 'www.ebay.co.uk',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
r = requests.get(url, headers=request_headers)
soup = BeautifulSoup(r.content, 'lxml')
items = soup.find_all('li', {'class': 'sresult lvresult clearfix li shic'})
Result:
print(len(items)) # 18
print(items[0]['listingid']) # 333200565336
Use the network tab of the developer tools to inspect the traffic if you've never done this before.
I'm trying to extract the JSON part of a script tag using Beautiful Soup, but it prints nothing. What's wrong?
import requests,json
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'referrer': 'https://google.com',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Pragma': 'no-cache',
}
r = requests.get('https://www.joox.com/id/id/single/xtMtD9ZdeLEdHp1w1fip8w==',headers=headers)
soup = BeautifulSoup(r.content, 'html5lib')
data = json.loads(soup.find('script').text)
print(data)
The soup result is like:
<script>
__NEXT_DATA__ = {"props":{"trackData":{"album_url":"https://imgcache.qq.com/music/photo/mid_album_300/O/F/002tkoBZ2zJPOF.jpg","code":0,"country":"hk","encodeSongId":"xtMtD9ZdeLEdHp1w1fip8w==","express_domain":"http://stream.music.joox.com/","flag":0,"gososo":0,"has_hifi":false,"has_hq":false,"imgSrc":"https://imgcache.qq.com/music/photo/mid_album_300/O/F/002tkoBZ2zJPOF.jpg","kbps_map":"{\"128\":4082449,\"192\":6522038,\"24\":825973,\"320\":10216761,\"48\":1708095,\"96\":3701266,\"ape\":0,\"flac\":0}\n","ktrack_id":0,"m4aUrl":"https://hk.stream.music.joox.com/C4000035Jew421IKj2.m4a?
Any ideas for this? Thanks.
You can use a substring and a regular expression to obtain what you are looking for.
import requests,json, re
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'referrer': 'https://google.com',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Pragma': 'no-cache',
}
r = requests.get('https://www.joox.com/id/id/single/xtMtD9ZdeLEdHp1w1fip8w==',headers=headers)
soup = BeautifulSoup(r.content, 'html5lib')
# find the script element which contains the required data
json_data = soup.find('script', text=re.compile("__NEXT_DATA__"))
# slice out the JSON between the "__NEXT_DATA__ = " assignment and the trailing "module={}"
data = str(json_data)[str(json_data).find('__NEXT_DATA__ = '):str(json_data).find('module={}')]
data = json.loads(data.replace('__NEXT_DATA__ = ', ''))
print(data)
Using this slicing technique, we locate the string pattern and extract just the JSON. After that you are good to go!
Hope this helps! Cheers!
Even with BeautifulSoup you still need a regex here, so why not use one directly?
import re
....
r = requests.get('https://www.joox.com/id/id/single/xtMtD9ZdeLEdHp1w1fip8w==',headers=headers)
jsData = re.search(r'__NEXT_DATA__\s+=\s+(.*)', r.text)
data = json.loads(jsData.group(1))
print(data)
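Based on the structure shown in the question's snippet, the parsed object can then be indexed directly; for example (key names taken from the sample above):
# The sample in the question shows the track info under props -> trackData.
track = data['props']['trackData']
print(track['imgSrc'])
print(track['m4aUrl'])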