why is my scraper showing 'none' as the output?

why is my scraper showing 'none' as the output? - python

from bs4 import BeautifulSoup
import requests

url = 'https://www.flipkart.com'
data = {
    'q': 'laptops'
}
# Sends the search term as form data in a POST to the site root.
html_code = requests.post(url, data=data).text
soup = BeautifulSoup(html_code, 'lxml')
# Collect every <div> whose class is "_3wU53n".
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')

you can try this code
import requests
from bs4 import BeautifulSoup

to_search = 'laptops'

# The search page is fetched with GET. Passing the query as `params` lets
# requests build and URL-encode the query string, instead of formatting a
# long URL literal by hand.
url = 'https://www.flipkart.com/search'
params = {
    'q': to_search,
    'otracker': 'search',
    'otracker1': 'search',
    'marketplace': 'FLIPKART',
    'as-show': 'on',
    'as': 'off',
}
html_code = requests.get(url, params=params).text
soup = BeautifulSoup(html_code, 'lxml')

# NOTE(review): "_3wU53n" looks like a generated class name; confirm it is
# still present in the returned HTML before relying on it.
titles = soup.find_all('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')

Can you please check again? Firstly, the URL 'https://www.flipkart.com' expects a 'GET' request, not a 'POST'. Secondly, the class _3wU53n which you have provided is not actually present in the HTML DOM. Have a look at the screenshot below, which shows the 'GET' request and the absence of the class _3wU53n in the HTML DOM.

Related

how to get all 'href' from page in company profile box

Can anyone tell me where the problem is? I am new to Python and I want to get all the links from this page. Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

re=requests.get('https://www.industrystock.com/en/companies/Agriculture')
re  # bare name: only echoes the value in a REPL/notebook
soup = BeautifulSoup(re.text, 'lxml')
link_list = []
page1 = soup.find_all('a', class_ = 'btn awe-info gotoJS iconColor_white')
page1  # bare name: only echoes the value in a REPL/notebook
for i in page1:
    # .get() returns None when the tag has no "href" attribute.
    link = (i.get('href'))
    link_list.append(link)

The links to company profiles are stored in data-href= attribute:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.industrystock.com/en/companies/Agriculture")
soup = BeautifulSoup(r.content, "lxml")

# Select the profile buttons, then read the link from their "data-href"
# attribute rather than from "href".
page1 = soup.find_all("a", class_="btn awe-info gotoJS iconColor_white")
for anchor in page1:
    print(anchor["data-href"])
Prints:
https://www.industrystock.com/en/company/profile/ARCA-Internet-Services-Ltd./370071
https://www.industrystock.com/en/company/profile/Забайкальская-аграрная-Ассоциация-образовательных-и-научных-учреждений/256182
https://www.industrystock.com/en/company/profile/...VÁŠ-INTERIÉR-s.r.o./534809
https://www.industrystock.com/en/company/profile/1-WITOS-s.r.o./529071
https://www.industrystock.com/en/company/profile/1.-TOUŠEŇSKÁ-s.r.o./544981
https://www.industrystock.com/en/company/profile/1.HEFAISTOS-s.r.o./541263
https://www.industrystock.com/en/company/profile/1.HRADECKÁ-ZEMĚDĚLSKÁ-a.s./548267
https://www.industrystock.com/en/company/profile/1.MAXIMA-INTERNATIONAL-s.r.o./530049
https://www.industrystock.com/en/company/profile/1.MIROSLAVSKÁ-STROJÍRNA-spol.-s-r.o./544781
https://www.industrystock.com/en/company/profile/1.VASTO-spol.-s-r.o./535985
https://www.industrystock.com/en/company/profile/1C-PRO-s.r.o./534831
https://www.industrystock.com/en/company/profile/1CSC-a.s./528169
https://www.industrystock.com/en/company/profile/1P-CONTROL/549995
https://www.industrystock.com/en/company/profile/2-ES-spol.-s-r.o./547849
https://www.industrystock.com/en/company/profile/2-G-SERVIS-spol.-s-r.o./528391
https://www.industrystock.com/en/company/profile/2-JCP-a.s./537151
https://www.industrystock.com/en/company/profile/2-THETA-ASE-s.r.o./545079
https://www.industrystock.com/en/company/profile/2LMAKERS-s.r.o./542127
https://www.industrystock.com/en/company/profile/2M-SERVIS-s.r.o./550923
https://www.industrystock.com/en/company/profile/2M-STATIC-s.r.o./549935
https://www.industrystock.com/en/company/profile/2M-STROJE-s.r.o./539885
https://www.industrystock.com/en/company/profile/2TMOTORS-s.r.o./543869
https://www.industrystock.com/en/company/profile/2VV-s.r.o./538993
https://www.industrystock.com/en/company/profile/2xSERVIS-s.r.o./528321
https://www.industrystock.com/en/company/profile/3-PLUS-1-SERVICE-s.r.o./535103
https://www.industrystock.com/en/company/profile/3-TOOLING-s.r.o./540599
https://www.industrystock.com/en/company/profile/3B-SOCIÁLNÍ-FIRMA-s.r.o./535127
https://www.industrystock.com/en/company/profile/3D-KOVÁRNA-s.r.o./549765
https://www.industrystock.com/en/company/profile/3D-TECH-spol.-s-r.o./548047
https://www.industrystock.com/en/company/profile/3DNC-SYSTEMS-s.r.o./549379

Try this:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.industrystock.com/en/companies/Agriculture')
soup = BeautifulSoup(response.text, 'lxml')

page1 = soup.find_all('a', {"class": 'btn awe-info gotoJS iconColor_white'})

# Subscripting a tag raises KeyError when the attribute is missing, so only
# keep anchors that actually carry an "href".
link_list = [anchor['href'] for anchor in page1 if anchor.has_attr('href')]
I would also recommend the built-in html.parser, which needs no extra dependency, when you are parsing HTML rather than XML.

Trying to scrape a particular part of the whole HTML

I want to scrape out only the highlighted part of this:
I am using the following code:
import requests
from bs4 import BeautifulSoup


def getdata(url):
    """Return the body of a GET request to *url* as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata(
    "https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(htmldata, 'html.parser')
# Prints the full text of every <td> cell on the page.
for item in soup.select('td'):
    print(item.text)
How do I scrape only the required part of the entire <td> tag?
Thanks in advance.

Try this...
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

emojiDescription = []
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.find_all("td"):
        # Only cells that carry an "id" attribute are considered.
        if tabledata.has_attr("id"):
            # Keep the last line of the cell text, without leading whitespace.
            description = tabledata.text.strip().split('\n')[-1].lstrip()
            # Store the result so the final print has something to show.
            emojiDescription.append(description)

print(emojiDescription)

This works for me to get the highlighted text-
import requests
from bs4 import BeautifulSoup


def getdata(url):
    """Return the body of a GET request to *url* as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata(
    "https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(htmldata, 'html.parser')
for item in soup.find_all('td'):
    # Print only the last line of each cell's stripped text.
    print((item.text.strip().split('\n'))[-1])

Assuming you can use the name to look up the description, you can combine :contains with chained next_sibling
import requests
from bs4 import BeautifulSoup


def getdata(url):
    """Return the body of a GET request to *url* as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata(
    "https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(htmldata, 'html.parser')

# ":-soup-contains" is the current spelling of the deprecated ":contains"
# pseudo-class. The second sibling after the matching <b> is taken and
# stripped as the description.
name_tag = soup.select_one('b:-soup-contains("Smiling Face With Open Mouth")')
print(name_tag.next_sibling.next_sibling.strip())
If not, you can use positional matching with nth-of-type
# Positional lookup: take the <b> inside the second <tr> among its siblings,
# then read the second sibling after it.
print(soup.select_one('tr:nth-of-type(2) b').next_sibling.next_sibling.strip())

deleting some parts of requested href attrs in bs4

I need the slugs of all articles on a page. I used bs4 to get the href contents of all articles, but some articles' links are full URLs with a prefix that I don't need. I want to remove that prefix from those items. I used this code:
import requests
import re
from bs4 import BeautifulSoup

r = requests.get('https://davidventuri.medium.com/')
soup = BeautifulSoup(r.text, 'html.parser')
all_slugs = soup.find_all('a', {'class': 'dn br'})
# Print the raw href of every matched anchor, unchanged.
for i in range(len(all_slugs)):
    slug = all_slugs[i]['href']
    print(slug)
Here is my result of getting hrefs:
/this-is-not-a-real-data-science-degree-d170c660c1cf
/not-a-real-degree-data-science-curriculum-2021-19ba9af2c1d4
/bitcoin-learning-path-9ed73f2f11d9
/your-first-day-of-school-eaf363b19ded
https://medium.com/free-code-camp/an-overview-of-every-data-visualization-course-on-the-internet-9ccf24ea9c9b
https://medium.com/free-code-camp/the-best-data-science-courses-on-the-internet-ranked-by-your-reviews-6dc5b910ea40
https://medium.com/free-code-camp/every-single-machine-learning-course-on-the-internet-ranked-by-your-reviews-3c4a7b8026c0
https://medium.com/free-code-camp/dive-into-deep-learning-with-these-23-online-courses-bf247d289cc0
/how-ai-is-revolutionizing-mental-health-care-a7cec436a1ce
https://medium.com/free-code-camp/i-ranked-all-the-best-data-science-intro-courses-based-on-thousands-of-data-points-db5dc7e3eb8e
Actually I want them as below:
/this-is-not-a-real-data-science-degree-d170c660c1cf
/not-a-real-degree-data-science-curriculum-2021-19ba9af2c1d4
/bitcoin-learning-path-9ed73f2f11d9
/your-first-day-of-school-eaf363b19ded
/an-overview-of-every-data-visualization-course-on-the-internet-9ccf24ea9c9b
/the-best-data-science-courses-on-the-internet-ranked-by-your-reviews-6dc5b910ea40
/every-single-machine-learning-course-on-the-internet-ranked-by-your-reviews-3c4a7b8026c0
/dive-into-deep-learning-with-these-23-online-courses-bf247d289cc0
/how-ai-is-revolutionizing-mental-health-care-a7cec436a1ce
/i-ranked-all-the-best-data-science-intro-courses-based-on-thousands-of-data-points-db5dc7e3eb8e
How can I remove the unwanted prefix with a regex or something else?

If the substring to replace is always the same, you can go without regex like this:
slug = a['href'].replace('https://medium.com/free-code-camp','')
Example
import requests
from bs4 import BeautifulSoup

r = requests.get('https://davidventuri.medium.com/')
soup = BeautifulSoup(r.text, 'html.parser')
all_slugs = soup.find_all('a', {'class': 'dn br'})
for a in all_slugs:
    # Drop the fixed publication prefix; hrefs without it are left unchanged.
    slug = a['href'].replace('https://medium.com/free-code-camp','')
    print(slug)
Output
/this-is-not-a-real-data-science-degree-d170c660c1cf
/not-a-real-degree-data-science-curriculum-2021-19ba9af2c1d4
/bitcoin-learning-path-9ed73f2f11d9
/your-first-day-of-school-eaf363b19ded
/an-overview-of-every-data-visualization-course-on-the-internet-9ccf24ea9c9b
/the-best-data-science-courses-on-the-internet-ranked-by-your-reviews-6dc5b910ea40
/every-single-machine-learning-course-on-the-internet-ranked-by-your-reviews-3c4a7b8026c0
/dive-into-deep-learning-with-these-23-online-courses-bf247d289cc0
/how-ai-is-revolutionizing-mental-health-care-a7cec436a1ce
/i-ranked-all-the-best-data-science-intro-courses-based-on-thousands-of-data-points-db5dc7e3eb8e
Edit
You can also use split()
slug = a['href'].split('/')[-1]
Example
import requests
from bs4 import BeautifulSoup

r = requests.get('https://davidventuri.medium.com/')
soup = BeautifulSoup(r.text, 'html.parser')
all_slugs = soup.find_all('a', {'class': 'dn br'})
for a in all_slugs:
    # Keep only the text after the last "/".
    # NOTE: the result has no leading "/"; prepend one if it is needed.
    slug = a['href'].split('/')[-1]
    print(slug)

How to get just links of articles in list using BeautifulSoup

Hey guys, I got as far as being able to add each <a> tag to a list. The problem is that I just want the href link to be added to the links_with_text list, not the entire <a> tag. What am I doing wrong?
from bs4 import BeautifulSoup
from requests import get
import requests

URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id = 'hnmain')
articles = results.find_all(class_="title")
links_with_text = []
for article in articles:
    # find() returns the whole <a> tag (or None when there is no match),
    # so the list ends up holding tags rather than URL strings.
    link = article.find('a', href=True)
    links_with_text.append(link)
print('\n'.join(map(str, links_with_text)))
This prints exactly how I want the list to print but I just want the href from every a class not the entire a class. Thank you

To get all links from the https://news.ycombinator.com, you can use CSS selector 'a.storylink'.
For example:
from bs4 import BeautifulSoup
from requests import get
import requests

URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

links_with_text = []
for a in soup.select('a.storylink'):    # <-- find all <a> with class="storylink"
    links_with_text.append(a['href'])   # <-- note the ['href']

# Unpack the list so each link is printed on its own line.
print(*links_with_text, sep='\n')
Prints:
https://blog.mozilla.org/futurereleases/2020/06/18/introducing-firefox-private-network-vpns-official-product-the-mozilla-vpn/
https://mxb.dev/blog/the-return-of-the-90s-web/
https://github.blog/2020-06-18-introducing-github-super-linter-one-linter-to-rule-them-all/
https://www.sciencemag.org/news/2018/11/why-536-was-worst-year-be-alive
https://www.strongtowns.org/journal/2020/6/16/do-the-math-small-projects
https://devblogs.nvidia.com/announcing-cuda-on-windows-subsystem-for-linux-2/
https://lwn.net/SubscriberLink/822568/61d29096a4012e06/
https://imil.net/blog/posts/2020/fakecracker-netbsd-as-a-function-based-microvm/
https://jepsen.io/consistency
https://tumblr.beesbuzz.biz/post/621010836277837824/advice-to-young-web-developers
https://archive.org/search.php?query=subject%3A%22The+Navy+Electricity+and+Electronics+Training+Series%22&sort=publicdate
https://googleprojectzero.blogspot.com/2020/06/ff-sandbox-escape-cve-2020-12388.html?m=1
https://apnews.com/1da061ce00eb531291b143ace0eed1c9
https://support.apple.com/library/content/dam/edam/applecare/images/en_US/appleid/android-apple-music-account-payment-none.jpg
https://standpointmag.co.uk/issues/may-june-2020/the-healing-power-of-birdsong/
https://steveblank.com/2020/06/18/the-coming-chip-wars-of-the-21st-century/
https://www.videolan.org/security/sb-vlc3011.html
https://onesignal.com/careers/2023b71d-2f44-4934-a33c-647855816903
https://www.bbc.com/news/world-europe-53006790
https://github.com/efficient/HOPE
https://everytwoyears.org/
https://www.historytoday.com/archive/natural-histories/intelligence-earthworms
https://cr.yp.to/2005-590/powerpc-cwg.pdf
https://quantum.country/
http://www.crystallography.net/cod/
https://parkinsonsnewstoday.com/2020/06/17/tiny-magnetically-powered-implant-may-be-future-of-deep-brain-stimulation/
https://spark.apache.org/releases/spark-release-3-0-0.html
https://arxiv.org/abs/1712.09624
https://www.washingtonpost.com/technology/2020/06/18/data-privacy-law-sherrod-brown/
https://blog.chromium.org/2020/06/improving-chromiums-browser.html

BeautifulSoup and scraping href's isn't working

Again I am having trouble scraping href's in BeautifulSoup. I have a list of pages that I am scraping and I have the data but I can't seem to get the hrefs even when I use various codes that work in other scripts.
So here is the code and my data will be below that:
import requests
from bs4 import BeautifulSoup

with open('states_names.csv', 'r') as reader:
    # One entry per line of the file; spaces are replaced with hyphens.
    states = [states.strip().replace(' ', '-') for states in reader]

url = 'https://www.hauntedplaces.org/state/alabama'

for state in states:
    # NOTE(review): url already ends with a state name, so url+state appends
    # a second one directly after it; confirm this is intended.
    page = requests.get(url+state)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.findAll('div', class_='description')
    # When I try to add .get('href') I get a traceback error. Am I trying to scrape the href too early?
    h_page = soup.findAll('h3')
<h3>Gaines Ridge Dinner Club</h3>
<h3>Purifoy-Lipscomb House</h3>
<h3>Kate Shepard House Bed and Breakfast</h3>
<h3>Cedarhurst Mansion</h3>
<h3>Crybaby Bridge</h3>
<h3>Gaineswood Plantation</h3>
<h3>Mountain View Hospital</h3>

This works perfectly:
from bs4 import BeautifulSoup
import requests

url = 'https://www.hauntedplaces.org/state/Alabama'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# The selector matches <a> tags nested inside <div class="description">, so
# the href is read from the anchor, not from the div.
for link in soup.select('div.description a'):
    print(link['href'])

Try that:
# `page` is expected to be an already fetched requests response.
soup = BeautifulSoup(page.content, 'html.parser')
list0 = []
possible_links = soup.find_all('a')
for link in possible_links:
    # Skip anchors that have no href attribute at all.
    if link.has_attr('href'):
        print (link.attrs['href'])
        list0.append(link.attrs['href'])
print(list0)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

why is my scraper showing 'none' as the output? - python

from bs4 import BeautifulSoup
import requests

url = 'https://www.flipkart.com'
data = {
    'q': 'laptops'
}
# Sends the search term as form data in a POST to the site root.
html_code = requests.post(url, data=data).text
soup = BeautifulSoup(html_code, 'lxml')
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')

Related

how to get all 'href' from page in company profile box

Trying to scrape a particular part of the whole HTML

deleting some parts of requested href attrs in bs4

How to get just links of articles in list using BeautifulSoup

BeautifulSoup and scraping href's isn't working

Categories

Resources