How to pretty print BeautifulSoup's string output - python

I have the following scraping code:
import requests, bs4

def make_soup():
    url = 'https://www.airbnb.pl/s/Girona--Hiszpania/homes?place_id=ChIJRRrTHsPNuhIRQMqjIeD6AAM&query=Girona%2C%20Hiszpania&refinement_paths%5B%5D=%2Fhomes&allow_override%5B%5D=&s_tag=b5bnciXv'
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    return soup

def get_listings():
    soup = make_soup()
    listings = soup.select('._f21qs6')
    number_of_listings = len(listings)
    print("Current number of listings: " + str(number_of_listings))
    while number_of_listings != 18:
        print("Too few listings: " + str(number_of_listings))
        soup = make_soup()
        listings = soup.select('._f21qs6')
        number_of_listings = len(listings)
    print("All fine! The number of listings is: " + str(number_of_listings))
    return listings

new_listings = get_listings()
print(new_listings)
I think get_listings() returns listings as a string, so I cannot use BeautifulSoup's prettify() on it, and new_listings gets printed as one block of text.
Is there any way to print new_listings in an HTML-esque format, or at least have each tag printed on a separate line?

type(new_listings)
# list
Shows that new_listings is a list. Try:
print(new_listings[0].prettify())
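If you want every listing pretty-printed rather than only the first, a minimal sketch (assuming new_listings is the list returned by get_listings()) is to prettify each tag and join the results:
print("\n".join(tag.prettify() for tag in new_listings))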

Try:
from pprint import pprint
pprint(new_listings)
pprint formats the output much more cleanly.

Related

Web scraping from multiple pages with for loop part 2

My original problem:
"I have created web scraping tool for picking data from listed houses.
I have problem when it comes to changing page. I did make for loop to go from 1 to some number.
Problem is this: In this web pages last "page" can be different all the time. Now it is 70, but tomorrow it can be 68 or 72. And if I but range for example to (1-74) it will print last page many times, because if you go over the maximum the page always loads the last."
Then I got help from Ricco D who wrote code that it will know when to stop:
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.etuovi.com/myytavat-asunnot/oulu?haku=M1582971026&sivu=1000'
page = requests.get(url)
soup = bs(page.content, 'html.parser')

last_page = None
pages = []
buttons = soup.find_all('button', class_="Pagination__button__3H2wX")
for button in buttons:
    pages.append(button.text)
print(pages)
This works just fine.
But when I try to combine this with my original code, which also works by itself, I run into an error:
Traceback (most recent call last):
  File "C:/Users/Käyttäjä/PycharmProjects/Etuoviscaper/etuovi.py", line 29, in <module>
    containers = page_soup.find("div", {"class": "ListPage__cardContainer__39dKQ"})
  File "C:\Users\Käyttäjä\PycharmProjects\Etuoviscaper\venv\lib\site-packages\bs4\element.py", line 2173, in __getattr__
    raise AttributeError(
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
This is the error I get. Any ideas on how to get this to work? Thanks.
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import re
import requests

my_url = 'https://www.etuovi.com/myytavat-asunnot/oulu?haku=M1582971026&sivu=1'

filename = "asunnot.csv"
f = open(filename, "w")
headers = "Neliöt; Hinta; Osoite; Kaupunginosa; Kaupunki; Huoneistoselitelmä; Rakennusvuosi\n"
f.write(headers)

page = requests.get(my_url)
soup = soup(page.content, 'html.parser')

pages = []
buttons = soup.findAll("button", {"class": "Pagination__button__3H2wX"})
for button in buttons:
    pages.append(button.text)

last_page = int(pages[-1])

for sivu in range(1, last_page):
    req = requests.get(my_url + str(sivu))
    page_soup = soup(req.text, "html.parser")
    containers = page_soup.findAll("div", {"class": "ListPage__cardContainer__39dKQ"})

    for container in containers:
        size_list = container.find("div", {"class": "flexboxgrid__col-xs__26GXk flexboxgrid__col-md-4__2DYW-"}).text
        size_number = re.findall("\d+\,*\d+", size_list)
        size = ''.join(size_number)  # apartment size in square metres

        prize_line = container.find("div", {"class": "flexboxgrid__col-xs-5__1-5sb flexboxgrid__col-md-4__2DYW-"}).text
        prize_number_list = re.findall("\d+\d+", prize_line)
        prize = ''.join(prize_number_list[:2])  # apartment price

        address_city = container.h4.text
        address_list = address_city.split(', ')[0:1]
        address = ' '.join(address_list)  # address
        city_part = address_city.split(', ')[-2]  # district
        city = address_city.split(', ')[-1]  # city

        type_org = container.h5.text
        type = type_org.replace("|", "").replace(",", "").replace(".", "")  # apartment type

        year_list = container.find("div", {"class": "flexboxgrid__col-xs-3__3Kf8r flexboxgrid__col-md-4__2DYW-"}).text
        year_number = re.findall("\d+", year_list)
        year = ' '.join(year_number)  # year built

        print("pinta-ala: " + size)
        print("hinta: " + prize)
        print("osoite: " + address)
        print("kaupunginosa: " + city_part)
        print("kaupunki: " + city)
        print("huoneistoselittelmä: " + type)
        print("rakennusvuosi: " + year)

        f.write(size + ";" + prize + ";" + address + ";" + city_part + ";" + city + ";" + type + ";" + year + "\n")

f.close()
Your main problem has to do with the way you use soup. You first import BeautifulSoup as soup, and then you override this name when you create your first BeautifulSoup instance:
soup = soup(page.content, 'html.parser')
From this point on, soup no longer refers to the BeautifulSoup class but to the object you just created. Hence, when you try to create a new instance some lines further down (page_soup = soup(req.text, "html.parser")), this fails because soup no longer refers to BeautifulSoup.
So the best thing would be to import the library correctly, like so: from bs4 import BeautifulSoup (or import AND use it as bs, like Ricco D did), and then change the two instantiating lines like so:
soup = BeautifulSoup(page.content, 'html.parser')  # .content is fine on Python 2.7, see below
and
page_soup = BeautifulSoup(req.text, "html.parser")  # .text is what you want on Python 3, see below
If you're on Python 3, the proper requests syntax would be page.text and not page.content, as .content returns bytes in Python 3, which is not what you want (BeautifulSoup needs a str). If you're on Python 2.7 you should probably change req.text to req.content.
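Putting it together, here is a minimal sketch of the corrected flow (assuming the etuovi URL and the rest of your loop stay as in your code):
import requests
from bs4 import BeautifulSoup  # keep the class under its own name, don't rebind it

my_url = 'https://www.etuovi.com/myytavat-asunnot/oulu?haku=M1582971026&sivu='
page = requests.get(my_url + '1')
soup = BeautifulSoup(page.text, 'html.parser')  # first parse; .text gives a str

req = requests.get(my_url + '2')
page_soup = BeautifulSoup(req.text, 'html.parser')  # second parse still works, nothing was shadowed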
Good luck.
Finding your elements by class name doesn't seem to be the best idea, because the same class name is used for many of the surrounding elements.
I don't know exactly what you are looking for because of the language. I suggest you go to the website, press F12, press Ctrl+F, and type an XPath to see which elements you get. If you don't know about XPaths, read this: https://blog.scrapinghub.com/2016/10/27/an-introduction-to-xpath-with-examples
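If you also want to try an XPath from Python, not just in the browser's Ctrl+F search, a minimal sketch using lxml instead of BeautifulSoup could look like this; the XPath expression is only an illustration, so replace it with the one you tested in the browser:
import requests
from lxml import html

page = requests.get("https://www.etuovi.com/myytavat-asunnot/oulu?haku=M1582971026&sivu=1")
tree = html.fromstring(page.content)

# illustrative XPath: the h4 headings (address and city) inside each card container
addresses = tree.xpath('//div[contains(@class, "ListPage__cardContainer")]//h4/text()')
print(addresses)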

Web crawler not able to process more than one webpage

I am trying to extract some information about MTG cards from a webpage with the following program, but I repeatedly retrieve information about the initial page (InitUrl). The crawler is unable to proceed further. I have started to believe that I am not using the correct URLs, or maybe there is a restriction on using urllib that slipped my attention. Here is the code that I have struggled with for weeks now:
import re
from math import ceil
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup

InitUrl = "https://mtgsingles.gr/search?q=dragon"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 4  # depth of pages to be retrieved
query = InitUrl.split("?")[1]

for i in range(0, NumOfPages):
    if i == 0:
        Url = InitUrl
    else:
        Url = URL_Next
    print(Url)

    UClient = uReq(Url)  # downloading the url
    page_html = UClient.read()
    UClient.close()
    page_soup = soup(page_html, "html.parser")

    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        print(card_name + "\n" + cardP_T + "\n" + cardType + "\n")

    try:
        URL_Next = InitUrl + "&page=" + str(i + 2)
        print("The next URL is: " + URL_Next + "\n")
    except IndexError:
        print("Crawling process completed! No more infomation to retrieve!")
    else:
        NumOfCrawledPages += 1
        Url = URL_Next
    finally:
        print("Moving to page : " + str(NumOfCrawledPages + 1) + "\n")
One of the reasons your code fails is that you don't use cookies. The site seems to require them to allow paging.
A clean and simple way of extracting the data you're interested in would be like this:
import requests
from bs4 import BeautifulSoup
# the site actually uses this url under the hood for paging - check out Google Dev Tools
paging_url = "https://mtgsingles.gr/search?ajax=products-listing&lang=en&page={}&q=dragon"
return_list = []
# the page-scroll will only work when we support cookies
# so we fetch the page in a session
session = requests.Session()
session.get("https://mtgsingles.gr/")
All pages have a Next button except the last one, so we use this knowledge to loop until the Next button goes away. When it does (meaning the last page has been reached), the button is replaced by an 'li' tag with the class 'next hidden', which only exists on the last page.
Now we're ready to start looping
page = 1  # set count for start page
keep_paging = True  # use flag to end loop when last page is reached

while keep_paging:
    print("[*] Extracting data for page {}".format(page))
    r = session.get(paging_url.format(page))
    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select('.iso-item.item-row-view.clearfix')

    for item in items:
        name = item.find('div', class_='col-md-10').get_text().strip().split('\xa0')[0]
        toughness_element = item.find('div', class_='card-power-toughness')
        try:
            toughness = toughness_element.get_text().strip()
        except:
            toughness = None
        cardtype = item.find('div', class_='cardtype').get_text()
        card_dict = {
            "name": name,
            "toughness": toughness,
            "cardtype": cardtype
        }
        return_list.append(card_dict)

    if soup.select('li.next.hidden'):  # this element only exists if the last page is reached
        keep_paging = False
        print("[*] Scraper is done. Quitting...")
    else:
        page += 1

# do stuff with your list of dicts - e.g. load it into pandas and save it to a spreadsheet
This will scroll until no more pages exist, no matter how many subpages the site has.
My point in the comment above was merely that if you encounter an exception in your code, your page count will never increase. That's probably not what you want, which is why I recommended learning a little more about how the whole try-except-else-finally construct behaves.
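For reference, a minimal runnable sketch of how the four branches behave (fetch_page here is a hypothetical stand-in for your request code):
NumOfCrawledPages = 0

def fetch_page(url):
    # hypothetical helper standing in for your urlopen/requests call
    if not url:
        raise IndexError("no more pages")
    return "<html>...</html>"

try:
    page = fetch_page("")  # raises, because the url is empty
except IndexError:
    print("Crawling process completed!")  # runs only when the try block raised
else:
    NumOfCrawledPages += 1  # runs only when the try block did NOT raise
finally:
    print("Moving to page : " + str(NumOfCrawledPages + 1))  # runs in every case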
I am also baffled by the request returning the same reply and ignoring the page parameter. As a dirty solution I can offer you to first set the page-size to a number high enough to get all the items you want (this parameter works, for some reason...).
import re
from math import ceil
import requests
from bs4 import BeautifulSoup as soup

InitUrl = Url = "https://mtgsingles.gr/search"
NumOfCrawledPages = 0
URL_Next = ""
NumOfPages = 2  # depth of pages to be retrieved
query = "dragon"
cardSet = set()

for i in range(1, NumOfPages):
    page_html = requests.get(InitUrl, params={"page": i, "q": query, "page-size": 999})
    print(page_html.url)
    page_soup = soup(page_html.text, "html.parser")

    cards = page_soup.findAll("div", {"class": ["iso-item", "item-row-view"]})
    for card in cards:
        card_name = card.div.div.strong.span.contents[3].contents[0].replace("\xa0 ", "")
        if len(card.div.contents) > 3:
            cardP_T = card.div.contents[3].contents[1].text.replace("\n", "").strip()
        else:
            cardP_T = "Does not exist"
        cardType = card.contents[3].text
        cardString = card_name + "\n" + cardP_T + "\n" + cardType + "\n"
        cardSet.add(cardString)
        print(cardString)

    NumOfCrawledPages += 1
    print("Moving to page : " + str(NumOfCrawledPages + 1) + " with " + str(len(cards)) + " (cards)\n")

beautifulSoup does not match chrome inspect while Python webscraping

I am currently trying to web scrape protein sequences off of the NCBI protein database. At this point, the user can search for a protein and I can get the link to the first result that the database spits out. However, when I run this through Beautiful Soup, the soup does not match what Chrome's inspect element shows, nor does it contain the sequence at all.
Here is my current code:
import string
import requests
from bs4 import BeautifulSoup

def getSequence():
    searchProt = input("Enter a Protein Name!:")
    if searchProt != '':
        searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
        page = requests.get(searchString)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = str(soup)
        accIndex = soup.find("a")
        accessionStart = soup.find('<dd>', accIndex)
        accessionEnd = soup.find('</dd>', accessionStart + 4)
        accession = soup[accessionStart + 4: accessionEnd]
        newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
        try:
            newPage = requests.get(newSearchString)
            # This is where it fails
            newSoup = BeautifulSoup(newPage.text, 'html.parser')
            aaList = []
            spaceCount = newSoup.count("ff_line")
            print(spaceCount)
            for i in range(spaceCount):
                startIndex = newSoup.find("ff_line")
                startIndex = newSoup.find(">", startIndex) + 2
                nextAA = newSoup[startIndex]
                while nextAA in string.ascii_lowercase:
                    aaList.append(nextAA)
                    startIndex += 1
                    nextAA = newSoup[startIndex]
            return aaList
        except:
            print("Please Enter a Valid Protein")
I have been trying to run it with the search 'p53' and have gotten to the link: here
I have looked at a long series of webscraping entries on this website and tried a lot of things including installing selenium and using different parsers. I am still confused about why these don't match. (Sorry if this is a repeat question, I am very new to webscraping and currently have a concussion so I am looking for a bit of individual case feedback)
This code will extract the protein sequence you want using Selenium. I've modified your original code to give you the result you wanted.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

driver = webdriver.Firefox()

def getSequence():
    searchProt = input("Enter a Protein Name!:")
    if searchProt != '':
        searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
        page = requests.get(searchString)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = str(soup)
        accIndex = soup.find("a")
        accessionStart = soup.find('<dd>', accIndex)
        accessionEnd = soup.find('</dd>', accessionStart + 4)
        accession = soup[accessionStart + 4: accessionEnd]
        newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
        try:
            driver.get(newSearchString)
            html = driver.page_source
            newSoup = BeautifulSoup(html, "lxml")
            ff_tags = newSoup.find_all(class_="ff_line")
            aaList = []
            for tag in ff_tags:
                aaList.append(tag.text.strip().replace(" ", ""))
            protSeq = "".join(aaList)
            return protSeq
        except:
            print("Please Enter a Valid Protein")

sequence = getSequence()
print(sequence)
Which produces the following output for input of "p53":
meepqsdlsielplsqetfsdlwkllppnnvlstlpssdsieelflsenvtgwledsggalqgvaaaaastaedpvtetpapvasapatpwplsssvpsyktfqgdygfrlgflhsgtaksvtctyspslnklfcqlaktcpvqlwvnstpppgtrvramaiykklqymtevvrrcphherssegdslappqhlirvegnlhaeylddkqtfrhsvvvpyeppevgsdcttihynymcnsscmggmnrrpiltiitledpsgnllgrnsfevricacpgrdrrteeknfqkkgepcpelppksakralptntssspppkkktldgeyftlkirgherfkmfqelnealelkdaqaskgsedngahssylkskkgqsasrlkklmikregpdsd

How to detect a strong tag and add a "*" to each?

I have this code in Python; what it does is scrape the text content of the articles from a website and save it in different files. I would like to know how to detect each strong tag and add a "*" before or after it (that is the result I need). Here is my code:
import urllib2
import re
from bs4 import BeautifulSoup
import time

def _remove_attrs(soup):
    for tag in soup.findAll(True):
        href = ''
        if (tag.has_attr('href')):
            href = tag.get('href')
        src = ''
        if (tag.has_attr('src')):
            src = tag.get('src')
        # tag.attrs = None
        tag.attrs = {}
        if (href != ''):
            tag['href'] = href
        if (src != ''):
            tag['src'] = src
    return soup

def _remove_empty(soup):
    return soup
    for x in soup.find_all():
        if len(x.text) == 0:
            x.extract()
    return soup

base_url = 'http://www.scavonehnos.com.py/index.php?mact=Vmcs,cntnt01,print,0&cntnt01articleid='

for x in range(10, 12):
    n_url = base_url + str(x)
    print("#PAGINA: " + n_url)
    page = urllib2.urlopen(n_url)
    soup = BeautifulSoup(page, 'html.parser')
    contenido = (soup.div.get_text())
    file = open('vicentec/prod_' + str(x) + '.txt', 'w')
    file.write(u' '.strip(contenido).join((contenido)).encode('utf-8'))
    file.close()
    time.sleep(5)
As you can see, I want to add an asterisk to each <strong> tag on the page.
For those who visit this question: I already solved it myself, and the code below works perfectly.
import urllib2
import re
from bs4 import BeautifulSoup
import time

def _remove_attrs(soup):
    for tag in soup.findAll(True):
        href = ''
        if (tag.has_attr('href')):
            href = tag.get('href')
        src = ''
        if (tag.has_attr('src')):
            src = tag.get('src')
        # tag.attrs = None
        tag.attrs = {}
        if (href != ''):
            tag['href'] = href
        if (src != ''):
            tag['src'] = src
    return soup

def _remove_empty(soup):
    return soup
    for x in soup.find_all(''):
        if len(x.text) == 0:
            x.extract()
    return soup

base_url = 'http://www.scavonehnos.com.py/index.php?mact=Vmcs,cntnt01,print,0&cntnt01articleid='

for x in range(10, 225):
    n_url = base_url + str(x)
    print("#PAGINA: " + n_url)
    page = urllib2.urlopen(n_url)
    soup = BeautifulSoup(page, 'html.parser')
    for strong in soup.select('strong'):
        strong.replace_with('#' + strong.get_text())
    contenido = (soup.div.get_text())
    fprod = 'vicentec/prod_' + (str(x)) + '.txt'
    file = open(fprod, "w")
    file.write(u' '.strip(contenido).join((contenido)).encode('utf-8'))
    file.close()
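If you want a literal "*" around each strong tag instead of the "#" used above, the same replace_with approach should work; a minimal variation:
for strong in soup.select('strong'):
    strong.replace_with('*' + strong.get_text() + '*')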

Scrape page with generator

I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time

tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')

tld = "uz"
has_next = True
page = 0

def create_link(tld, page):
    if page == 0:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    else:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
    return link

def check_for_next(soup):
    disabled_nav = soup.find(class_="pagingDivDisabled")
    if disabled_nav:
        if "Next" in disabled_nav:
            return False
        else:
            return True
    else:
        return True

def make_soup(link):
    html = jw.get_page(link)
    soup = BeautifulSoup(html, "lxml")
    return soup

def all_the_pages(counter):
    while True:
        link = create_link(tld, counter)
        soup = make_soup(link)
        if check_for_next(soup) == True:
            yield counter
        else:
            break
        counter += 1

def scrape_page(soup):
    table = soup.find('table', {'class': 'rankTable'})
    th = table.find('tbody')
    test = th.find_all("td")
    correct_cells = range(1, len(test), 3)
    for cell in correct_cells:
        # print test[cell]
        url = repr(test[cell])
        content = re.sub("<[^>]*>", "", url)
        sites.writerow([tld] + [content])

def main():
    for page in all_the_pages(0):
        print page
        link = create_link(tld, page)
        print link
        soup = make_soup(link)
        scrape_page(soup)

main()
My thinking behind the code:
The scraper should get the page, determine whether there is another page that follows, scrape the current page, and move on to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm going about it here make sense?
As I told you, you could use Selenium to programmatically click the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup

def page_count():
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        soup = BeautifulSoup(html)
        table = soup.find('table', {'class': 'rankTable'})
        if len(table.find_all('tr')) <= 1:
            return pages
        pages += 1
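A rough sketch of how this could drive the rest of your loop, reusing your existing create_link, make_soup and scrape_page helpers (the exact off-by-one depends on how the site numbers its pages):
total_pages = page_count()
for page in range(total_pages):
    link = create_link(tld, page)  # from your existing code
    soup = make_soup(link)
    scrape_page(soup)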
