I've created a script in Python that uses POST requests to fetch the names of different suppliers from a webpage, but unfortunately I'm getting this error: AttributeError: 'NoneType' object has no attribute 'text', even though it seemed to me that I did things the right way.
websitelink
To populate the content, you have to click on the search button, just as shown in the image.
Here's what I've tried so far:
import requests
from bs4 import BeautifulSoup
url = "https://www.gebiz.gov.sg/ptn/supplier/directory/index.xhtml"
r = requests.get(url)
soup = BeautifulSoup(r.text,"lxml")
payload = {
    'contentForm': 'contentForm',
    'contentForm:j_idt225_listButton2_HIDDEN-INPUT': '',
    'contentForm:j_idt161_inputText': '',
    'contentForm:j_idt164_SEARCH': '',
    'contentForm:j_idt167_selectManyMenu_SEARCH-INPUT': '',
    'contentForm:j_idt167_selectManyMenu-HIDDEN-INPUT': '',
    'contentForm:j_idt167_selectManyMenu-HIDDEN-ACTION-INPUT': '',
    'contentForm:search': 'Search',
    'contentForm:j_idt185_select': 'SUPPLIER_NAME',
    'javax.faces.ViewState': soup.select_one('[id="javax.faces.ViewState"]')['value']
}

res = requests.post(url, data=payload, headers={
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
})
sauce = BeautifulSoup(res.text,"lxml")
item = sauce.select_one(".form2_ROW").text
print(item)
Even just this portion of the result will do: 8121 results found.
Full traceback:
Traceback (most recent call last):
  File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\general_demo.py", line 27, in <module>
    item = sauce.select_one(".form2_ROW").text
AttributeError: 'NoneType' object has no attribute 'text'
You need to find a way to get the cookie. The following currently works for me across multiple requests.
import requests
from bs4 import BeautifulSoup
url = "https://www.gebiz.gov.sg/ptn/supplier/directory/index.xhtml"
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.gebiz.gov.sg/ptn/supplier/directory/index.xhtml',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '__cfduid=d3fe47b7a0a7f3ef307c266817231b5881555951761; wlsessionid=pFpF87sa9OCxQhUzwQ3lXcKzo04j45DP3lIVYylizkFMuIbGi6Ka!1395223647; BIGipServerPTN2_PRD_Pool=52519072.47873.0000'
}

with requests.Session() as s:
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    payload = {
        'contentForm': 'contentForm',
        'contentForm:search': 'Search',
        'contentForm:j_idt185_select': 'SUPPLIER_NAME',
        'javax.faces.ViewState': soup.select_one('[id="javax.faces.ViewState"]')['value']
    }
    res = s.post(url, data=payload, headers=headers)
    sauce = BeautifulSoup(res.text, "lxml")
    item = sauce.select_one(".formOutputText_HIDDEN-LABEL.outputText_TITLE-BLACK").text
    print(item)
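That hard-coded Cookie header will stop working once the session expires. A minimal sketch of an alternative (my own addition, assuming the site issues its session cookies on the first GET): let the Session object collect and replay the cookies itself instead of pinning them by hand.

import requests
from bs4 import BeautifulSoup

url = "https://www.gebiz.gov.sg/ptn/supplier/directory/index.xhtml"

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0'
    # The first GET stores whatever cookies the server sets in s.cookies,
    # and the session replays them automatically on the POST below.
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    payload = {
        'contentForm': 'contentForm',
        'contentForm:search': 'Search',
        'contentForm:j_idt185_select': 'SUPPLIER_NAME',
        'javax.faces.ViewState': soup.select_one('[id="javax.faces.ViewState"]')['value']
    }
    res = s.post(url, data=payload)
    sauce = BeautifulSoup(res.text, "lxml")
    print(sauce.select_one(".formOutputText_HIDDEN-LABEL.outputText_TITLE-BLACK").text)

If the server still insists on the extra browser headers, re-add them, but leave the Cookie header out so the session keeps ownership of it.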
Related
So I'm currently trying to cart an item on Supreme through requests. After posting the carting request I don't get any errors, just a response saying it didn't work.
#imports
import requests
from bs4 import BeautifulSoup
#constants
baseurl = "https://www.supremenewyork.com/"
product_category = ""
size = ["Medium"]
product_keywords = ["Supreme®/The North Face® Steep Tech Fleece Pant"]
product_style = ["Brown"]
#functions
def carting(url):
    session = requests.Session()
    r = session.get(baseurl + url)
    soup = BeautifulSoup(r.text, "html.parser")
    name = soup.find("h1", {"itemprop": "name"}).text
    style = soup.find("p", {"itemprop": "model"}).text
    for keyword in product_keywords:
        if keyword in name:
            for keyword in product_style:
                if keyword in style:
                    print("Product Found! Adding to cart...")
                    form = soup.find("form", {"id": "cart-add"})
                    payload = {
                        "utf8": "✓",
                        "authenticity_token": form.find("input", {"name": "authenticity_token"})["value"],
                        "style": form.find("input", {"name": "style"})["value"],
                        "size": "92001",  # form.find("select", {"id": "size"})["value"] -- need to rework getting size through keyword
                        "qty": "1"
                    }
                    headers = {
                        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
                        "origin": "https://www.supremenewyork.com",
                        "referer": baseurl + url,
                        "path": form["action"],
                        'Host': 'www.supremenewyork.com',
                        'Accept': 'application/json',
                        'Proxy-Connection': 'keep-alive',
                        'X-Requested-With': 'XMLHttpRequest',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'en-us',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Origin': 'http://www.supremenewyork.com',
                        'Connection': 'keep-alive',
                    }
                    response = session.post(baseurl + form["action"], data=payload)
                    print(response.text)
    return session

carting("/shop/pants/mj1czv0pa/jcyp91a8w")
The output I get printed is
Product Found! Adding to cart...
{"cart":[],"success":false}
I wonder whether I just have to gamble on which of the headers to include to make it work, since maybe the site expects certain headers, or perhaps all of the headers a browser would send.
Help appreciated!
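One thing worth checking before gambling on header combinations: the headers dict built inside carting() is never actually passed to the POST, so the request goes out without the XMLHttpRequest markers the site presumably looks for. A minimal sketch of wiring it in (no guarantee the server accepts the request even then):

# Pass the headers that were built above; entries like "path" are not
# real HTTP request headers and can simply be dropped from the dict.
response = session.post(baseurl + form["action"], data=payload, headers=headers)
print(response.text)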
I would like to get the name of the hotel. Usually I have no problem with this kind of scraping, but here it doesn't work and I don't understand why.
Here's my script:
import numpy as np
import time
from random import randint
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
#headers= {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'https://www.espncricinfo.com/',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'
results = requests.get(url, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
hotel = soup.find('h2', class_ = 'hp__hotel-name').text
print(hotel)
Here's the error:
Traceback (most recent call last):
  File "test_booking_info_supp.py", line 75, in <module>
    hotel = soup.find('h2', class_ = 'hp__hotel-name').text
AttributeError: 'NoneType' object has no attribute 'text'
I cannot understand why I obtained None. The HTML and the link to the website are shown in the image.
You can try the id attribute of the tag to get the text value:
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
hotel = soup.find("h2",attrs={"id":"hp_hotel_name"})
print(hotel.text.strip("\n").split("\n")[1])
Output:
'Hyatt Regency Paris Etoile'
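If that string slicing ever breaks (it relies on the exact "\n" layout inside the tag), a slightly more forgiving variant of the same extraction (my own addition, assuming the hotel name is the last text line inside the h2) would be:

hotel = soup.find("h2", attrs={"id": "hp_hotel_name"})
# Split on real text lines rather than the raw newline layout; the name
# is taken as the last non-empty line inside the tag.
print(hotel.get_text(separator="\n", strip=True).split("\n")[-1])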
There is this site called dnsdumpster that provides all the sub-domains for a domain. I am trying to automate this process and print out a list of the subdomains. Each individual sub-domain is within the "td" HTML tag. I am trying to iterate through all these tags and print out the sub-domains, but I get an error.
import requests
import re
from bs4 import BeautifulSoup
headers = {
    'Host': 'dnsdumpster.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://dnsdumpster.com/',
    'Connection': 'close'
}

proxies = {
    'http': 'http://127.0.0.1:8080'
}

domain = 'google.com'

with requests.Session() as s:
    url = 'https://dnsdumpster.com'
    response = s.get(url, headers=headers, proxies=proxies)
    response.encoding = 'utf-8'  # Optional: requests infers this internally
    soup1 = BeautifulSoup(response.text, 'html.parser')
    input = soup1.find_all('input')
    csrfmiddlewaretoken_raw = str(input[0])
    csrfmiddlewaretoken = csrfmiddlewaretoken_raw[55:119]
    data = {
        'csrfmiddlewaretoken': csrfmiddlewaretoken,
        'targetip': domain
    }
    send_data = s.post(url, data=data, proxies=proxies, headers=headers)
    print(send_data.status_code)
    soup2 = BeautifulSoup(send_data.text, 'html.parser')
    td = soup2.find_all('td')
    for i in len(td):
        item = str(td[i])
        subdomain = item[21:37]
        print(subdomain)
The error looks like this:
Traceback (most recent call last):
  File "dns_dumpster_4.py", line 39, in <module>
    for i in len(td):
TypeError: 'int' object is not iterable
And once the above error is solved, I'd also need help with another question:
How can I use a regular expression to get the individual sub-domain from within this "td" tag? The contents of the tag are long and messy, and I only need the subdomain. I would really appreciate it if someone could help me get just the sub-domain name.
I try to catch the subdomains without using regex.
import requests
from bs4 import BeautifulSoup
headers = {
    'Host': 'dnsdumpster.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://dnsdumpster.com/',
    'Connection': 'close'
}

proxies = {
    'http': 'http://127.0.0.1:8080'
}

domain = 'google.com'

with requests.Session() as s:
    url = 'https://dnsdumpster.com'
    response = s.get(url, headers=headers, proxies=proxies)
    response.encoding = 'utf-8'  # Optional: requests infers this internally
    soup1 = BeautifulSoup(response.text, 'html.parser')
    input = soup1.find_all('input')
    csrfmiddlewaretoken_raw = str(input[0])
    csrfmiddlewaretoken = csrfmiddlewaretoken_raw[55:119]
    data = {
        'csrfmiddlewaretoken': csrfmiddlewaretoken,
        'targetip': domain
    }
    send_data = s.post(url, data=data, proxies=proxies, headers=headers)
    print(send_data.status_code)
    soup2 = BeautifulSoup(send_data.text, 'html.parser')
    td = soup2.find_all('td', {'class': 'col-md-3'})
    mysubdomain = []
    for dom in range(len(td)):
        if '.' in td[dom].get_text(strip=True):
            x = td[dom].get_text(strip=True, separator=',').split(',')
            mysubdomain.append(x)
    print(mysubdomain)

from functools import reduce
flat_list_of_mysubdomain = reduce(lambda x, y: x + y, mysubdomain)
print(flat_list_of_mysubdomain)
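Since the question also asked for a regex route, here is a minimal sketch of one (my own addition, assuming every wanted host name ends in the searched domain), run against the same response:

import re

# Match host names ending in the target domain, e.g. "mail.google.com"
# for domain = "google.com", then de-duplicate them.
pattern = re.compile(r'\b([\w.-]+\.' + re.escape(domain) + r')\b')
subdomains = sorted(set(pattern.findall(send_data.text)))
print(subdomains)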
I hope it helps you.
I'm trying to fetch a JSON response from a webpage using the script below. Here are the steps to populate the result on that site: click on the AGREE button located at the bottom of the webpage, then on the EDIT SEARCH button, and finally on the SHOW RESULTS button, without changing anything.
I've tried it like this:
import requests
from bs4 import BeautifulSoup
url = 'http://finra-markets.morningstar.com/BondCenter/Results.jsp'
post_url = 'http://finra-markets.morningstar.com/bondSearch.jsp'
payload = {
    'postData': {'Keywords': []},
    'ticker': '',
    'startDate': '',
    'endDate': '',
    'showResultsAs': 'B',
    'debtOrAssetClass': '1,2',
    'spdsType': ''
}

payload_second = {
    'count': '20',
    'searchtype': 'B',
    'query': {"Keywords":[{"Name":"debtOrAssetClass","Value":"3,6"},{"Name":"showResultsAs","Value":"B"}]},
    'sortfield': 'issuerName',
    'sorttype': '1',
    'start': '0',
    'curPage': '1'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
    s.headers['Referer'] = 'http://finra-markets.morningstar.com/BondCenter/UserAgreement.jsp'
    r = s.post(url, json=payload)
    s.headers['Access-Control-Allow-Headers'] = r.headers['Access-Control-Allow-Headers']
    s.headers['cf-request-id'] = r.headers['cf-request-id']
    s.headers['CF-RAY'] = r.headers['CF-RAY']
    s.headers['X-Requested-With'] = 'XMLHttpRequest'
    s.headers['Origin'] = 'http://finra-markets.morningstar.com'
    s.headers['Referer'] = 'http://finra-markets.morningstar.com/BondCenter/Results.jsp'
    r = s.post(post_url, json=payload_second)
    print(r.content)
This is the result I get when I run the script above:
b'\n\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\n\n\n{}'
How can I make the script produce the expected result from that site?
P.S. I do not wish to use Selenium to get this done.
The response for http://finra-markets.morningstar.com/BondCenter/Results.jsp doesn't contain the search results. It must be fetching the data asynchronously.
An easy way to find out which network request returned the search results is to search the requests for one of the search results using Firefox's Dev Tools:
To convert the HTTP request to a Python request, I copy the request as cURL from Firefox, import it into Postman, and then export it as Python code (a little long-winded (and lazy), I know!):
All this leads to the following code:
import requests
url = "http://finra-markets.morningstar.com/bondSearch.jsp"
payload = "count=20&searchtype=B&query=%7B%22Keywords%22%3A%5B%7B%22Name%22%3A%22debtOrAssetClass%22%2C%22Value%22%3A%223%2C6%22%7D%2C%7B%22Name%22%3A%22showResultsAs%22%2C%22Value%22%3A%22B%22%7D%5D%7D&sortfield=issuerName&sorttype=1&start=0&curPage=1"
headers = {
    'User-Agent': "...",
    'Accept': "text/plain, */*; q=0.01",
    'Accept-Language': "en-US,en;q=0.5",
    'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
    'X-Requested-With': "XMLHttpRequest",
    'Origin': "http://finra-markets.morningstar.com",
    'DNT': "1",
    'Connection': "keep-alive",
    'Referer': "http://finra-markets.morningstar.com/BondCenter/Results.jsp",
    'Cookie': "...",
    'cache-control': "no-cache"
}
response = requests.request("POST", url, data=payload, headers=headers)
print(response.text)
The response wasn't 100% JSON. So I just stripped away the outer whitespace and {B:..} part:
>>> text = response.text.strip()[3:-1]
>>> import json
>>> data = json.loads(text)
>>> data['Columns'][0]
{'moodyRating': {'ratingText': '', 'ratingNumber': 0},
'fitchRating': {'ratingText': None, 'ratingNumber': None},
'standardAndPoorRating': {'ratingText': '', 'ratingNumber': 0},
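For completeness, a small sketch of how the same call could be parameterised for paging (my own addition; it assumes the endpoint honours the start and curPage form fields the same way the browser does):

import json
import requests

def fetch_page(start, count=20):
    # Same form fields as the captured request; query stays a compact
    # JSON string inside the form body, only the paging values change.
    data = {
        'count': str(count),
        'searchtype': 'B',
        'query': json.dumps({"Keywords": [
            {"Name": "debtOrAssetClass", "Value": "3,6"},
            {"Name": "showResultsAs", "Value": "B"}]},
            separators=(',', ':')),
        'sortfield': 'issuerName',
        'sorttype': '1',
        'start': str(start),
        'curPage': str(start // count + 1),
    }
    r = requests.post("http://finra-markets.morningstar.com/bondSearch.jsp",
                      data=data, headers=headers)  # headers as above
    return json.loads(r.text.strip()[3:-1])  # drop the {B: ... } wrapper

first_page = fetch_page(0)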
I have added a few lines of code to my program to convert HTML to JSON using BeautifulSoup, but I'm getting an error for those added lines of code.
import httplib, urllib
from bs4 import BeautifulSoup
import json
params = urllib.urlencode({'cmm': 'onion', 'mkt': '', 'search': ''})
headers = {
    'Cookie': 'ASPSESSIONIDCCRBQBBS=KKLPJPKCHLACHBKKJONGLPHE; ASP.NET_SessionId=kvxhkhqmjnauyz55ult4hx55; ASPSESSIONIDAASBRBAS=IEJPJLHDEKFKAMOENFOAPNIM',
    'Origin': 'http://agmarknet.nic.in',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': 'http://agmarknet.nic.in/mark2_new.asp',
    'Connection': 'keep-alive'
}
conn = httplib.HTTPConnection("agmarknet.nic.in")
conn.request("POST", "/SearchCmmMkt.asp", params, headers)
response = conn.getresponse()
print response.status, response.reason
data = response.read()
htmldata = [[cell.text for cell in row("td")]for row in BeautifulSoup((data)("tr"),"lxml")]
x = json.dumps(dict(htmldata))
print x
When running the code, I am getting the following error:
Traceback (most recent call last):
  File "commodity.py", line 12, in <module>
    data1 = [[cell.text for cell in row("td")]for row in BeautifulSoup((data)("tr"),"lxml")]
TypeError: 'str' object is not callable
Can anyone tell me the correct approach to resolve this error?
You are trying to 'call' a string here:
BeautifulSoup((data)("tr"),"lxml")
(data) is a string, and (data)("tr") is a call to the string.
Perhaps you wanted to find all <tr> elements:
BeautifulSoup(data, "lxml").find_all("tr")
making the full statement:
htmldata = [[cell.text for cell in row("td")] for row in BeautifulSoup(data, "lxml").find_all("tr")]
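One caveat on the next step in the question: json.dumps(dict(htmldata)) will only succeed if every row yields exactly two cells. A defensive sketch (my own addition, since the actual table layout isn't shown):

# Keep only rows with exactly two cells so dict() can pair them up;
# header rows and rows with extra columns are skipped.
pairs = [row for row in htmldata if len(row) == 2]
x = json.dumps(dict(pairs))
print(x)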