Beautifulsoup - Remove HTML tags - python

I am trying to strip away all the HTML tags from the ‘profile’ soup, whoever am I unable to perform the “.text.strip()” operation as it is a list, as shown in code below
import requests
from bs4 import BeautifulSoup
from pprint import pprint
page = requests.get("https://web.archive.org/web/20121007172955/http://www.nga.gov/collection/anZ1.htm").text
soup = BeautifulSoup(company_page, "html.parser")
info = {}
info['Profile'] = soup.select('div.text-desc-members')
pprint(info)

Just iterate through that list:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
page = requests.get("https://web.archive.org/web/20121007172955/http://www.nga.gov/collection/anZ1.htm").text
soup = BeautifulSoup(page, "html.parser")
info = {}
info['Profile'] = soup.select('div.text-desc-members')
for item in info['Profile']:
pprint(item.text.strip())

Related

how to put web scraped data into a list

this is the code I used to get the data from a website with all the wordle possible words, im trying to put them in a list so I can create a wordle clone but I get a weird output when I do this. please help
import requests
from bs4 import BeautifulSoup
url = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
word_list = list(soup)
It do not need BeautifulSoup, simply split the text of the response:
import requests
url = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"
requests.get(url).text.split()
Or if you like to do it wit BeautifulSoup anyway:
import requests
from bs4 import BeautifulSoup
url = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
soup.text.split()
Output:
['women',
'nikau',
'swack',
'feens',
'fyles',
'poled',
'clags',
'starn',...]

How to breakup webpage text

I assume I have to use the /br to break up this text and extract the fixtures only, but cannot figure it out at all!
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pandas as pd
import pprint
import json
from urllib.request import urlopen
# sample web page
sample_web_page = 'https://bleacherreport.com/articles/10005879-epl-schedule-2021-22-official-list-of-fixtures-for-new-premier-league-season'
# call get method to request that page
page = requests.get(sample_web_page)
# with the help of beautifulSoup and html parser create soup
soup = BeautifulSoup(page.content, "html.parser")
z = soup.findAll('p', {'class':''})
print(z)

DataLayer with python

is it possible to extract value from data layer?
This is url = https://www.repubblica.it/cronaca/2021/04/09/news/vaccini_ecco_chi_sono_gli_oltre_due_milioni_di_italiani_che_hanno_ricevuto_una_dose_fuori_dalle_liste_delle_priorita_-295650286/?ref=RHTP-BH-I0-P1-S1-T1
I need to extract "dateModified" from <script type="application/ld+json"
Thank you!
import requests
from bs4 import BeautifulSoup
import json
url='https://www.repubblica.it/cronaca/2021/04/09/news/vaccini_ecco_chi_sono_gli_oltre_due_milioni_di_italiani_che_hanno_ricevuto_una_dose_fuori_dalle_liste_delle_priorita_-295650286/?ref=RHTP-BH-I0-P1-S1-T1'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
script = soup.find_all('script')[0]
print(script.find('dateModified'))
Yes, you need to use the .string attribute and dump that to json.loads.
Here's how:
import json
import requests
from bs4 import BeautifulSoup
url='https://www.repubblica.it/cronaca/2021/04/09/news/vaccini_ecco_chi_sono_gli_oltre_due_milioni_di_italiani_che_hanno_ricevuto_una_dose_fuori_dalle_liste_delle_priorita_-295650286/?ref=RHTP-BH-I0-P1-S1-T1'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print(json.loads(soup.find_all('script')[0].string)["dateModified"])
Output:
2021-04-09T10:26:13Z

BS4 returns [] instead of the wanted HTML tag

I want to parse the given website and scrape the table. To me the code looks right. New to python and web parsing
import requests
from bs4 import BeautifulSoup
response = requests.get('https://delhifightscorona.in/')
doc = BeautifulSoup(response.text, 'lxml-xml')
cases = doc.find_all('div', {"class": "cell"})
print(cases)
doing this returns
[]
Change your parser and the class and there you have it.
import requests
from bs4 import BeautifulSoup
soup = BeautifulSoup(requests.get('https://delhifightscorona.in/').text, 'html.parser').find('div', {"class": "grid-x grid-padding-x small-up-2"})
print(soup.find("h3").getText())
Output:
423,831
You can choose to print only the cases or the total stats with the date.
import requests
from bs4 import BeautifulSoup
response = requests.get('https://delhifightscorona.in/')
doc = BeautifulSoup(response.text, 'html.parser')
stats = doc.find('div', {"class": "cell medium-5"})
print(stats.text) #Print the whole block with dates and the figures
cases = stats.find('h3')
print(cases.text) #Print the cases only

list out of range error when using soup.select('placeholder')[0].get_text() in Beautiful soup

New to scraping and I'm trying to use Beautiful soup to get the Wheelbase value ( eventually other things) from a wikipedia page ( I'll deal with robots.txt later) This is the guide I've been using
Two questions
1.) How do I resolve the error below?
2.) How do I scrape the value in the cell that contains wheelbase is it just "td#Wheelbase td" ?
The error I get is
File "evscraper.py", line 25, in <module>
wheelbase_data['Wheelbase'] = soup.select('div#Wheelbase h3') [0].get_text()
IndexError: list index out of range
Thanks for any help!
__author__ = 'KirkLazarus'
import re
import json
import gspread
from oauth2client.client import SignedJwtAssertionCredentials
import bs4
from bs4 import BeautifulSoup
import requests
response =requests.get ('https://en.wikipedia.org/wiki/Tesla_Model_S')
soup = bs4.BeautifulSoup(response.text)
wheelbase_data['Wheelbase'] = soup.select('div#Wheelbase h3')[0].get_text()
print wheelbase_data
Well your first problem is with your selector. There's no div with the ID of "Wheelbase" on that page, so it's returning an empty list.
What follows is by no means perfect, but will get you what you want, only because you know the structure of the page already:
import re
import json
import gspread
from oauth2client.client import SignedJwtAssertionCredentials
import bs4
from bs4 import BeautifulSoup
import requests
wheelbase_data = {}
response =requests.get ('https://en.wikipedia.org/wiki/Tesla_Model_S')
soup = bs4.BeautifulSoup(response.text)
for link in soup.find_all('a'):
if link.get('href') == "/wiki/Wheelbase":
wheelbase = link
break
wheelbase_data['Wheelbase'] = wheelbase.parent.parent.td.text
It looks like you're looking for the incorrect path. I've had to do something similar in the past.. I'm not sure if this is the best approach but certainly worked for me.
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
car_data = pd.DataFrame()
models = ['Tesla_Model_S','Tesla_Model_X']
for model in models:
wiki = "https://en.wikipedia.org/wiki/{0}".format(model)
header = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(wiki,headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
table = soup.find("table", { "class" : "infobox hproduct" })
for row in table.findAll("tr")[2:]:
try:
field = row.findAll("th")[0].text.strip()
val = row.findAll("td")[0].text.strip()
car_data.set_value(model,field,val)
except:
pass
car_data

Categories

Resources