BeautifulSoup Python3 append multiple links output to single list - python

import requests
from bs4 import BeautifulSoup
import re

links = ["https://bitcointalk.org/index.php?board=159.0",
         "https://bitcointalk.org/index.php?board=159.40",
         "https://bitcointalk.org/index.php?board=159.80"]

def get_span():
    for url in links:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "html.parser")
        t1 = str(soup.findAll("span", id=re.compile('^msg_')))
        print(t1)
        t2 = [x for x in re.findall(r'\d+\.\d+', t1)]
        t2.sort(key=float, reverse=True)
        t3 = "https://bitcointalk.org/index.php?topic"
        for hn in t2:
            if len(hn) >= 9:
                hn = '{}={}'.format(t3, hn)
                print(hn)

get_span()
Hello!
My code iterates over the items in links, finds the span elements whose id starts with msg_, extracts all the numbers from those ids, and sorts them in descending order. The problem is that it processes the first URL and prints its output, then the second, and so on, so the output consists of three separate lists, each sorted on its own. I want a single output with the results from all three links sorted together in one list.

You can use list.extend to add items to a list and then sort the final list before returning it.
For example:
import re
import requests
from bs4 import BeautifulSoup

links = ["https://bitcointalk.org/index.php?board=159.0",
         "https://bitcointalk.org/index.php?board=159.40",
         "https://bitcointalk.org/index.php?board=159.80"]

def get_span(links):
    rv = []
    r = re.compile(r'\d{7,}\.\d+')
    for url in links:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        # collect the hrefs from every page into one list
        rv.extend(a['href'] for a in soup.select('span[id^="msg_"] > a') if r.search(a['href']))
    # sort the combined list once, by the numeric topic id
    return sorted(rv, key=lambda k: float(r.search(k).group(0)), reverse=True)

all_links = get_span(links)

# print links on screen:
for link in all_links:
    print(link)
Prints:
https://bitcointalk.org/index.php?topic=5255494.0
https://bitcointalk.org/index.php?topic=5255416.0
https://bitcointalk.org/index.php?topic=5255389.0
https://bitcointalk.org/index.php?topic=5255376.0
https://bitcointalk.org/index.php?topic=5255316.0
https://bitcointalk.org/index.php?topic=5254720.0
https://bitcointalk.org/index.php?topic=5254480.0
https://bitcointalk.org/index.php?topic=5254448.0
https://bitcointalk.org/index.php?topic=5254287.0
https://bitcointalk.org/index.php?topic=5252504.0
https://bitcointalk.org/index.php?topic=5251621.0
https://bitcointalk.org/index.php?topic=5250998.0
https://bitcointalk.org/index.php?topic=5250388.0
https://bitcointalk.org/index.php?topic=5250185.0
https://bitcointalk.org/index.php?topic=5248406.0
https://bitcointalk.org/index.php?topic=5247112.0
... and so on.
EDIT: If you want to show the link text next to the URL, you can use this example:
import re
import requests
from bs4 import BeautifulSoup

links = ["https://bitcointalk.org/index.php?board=159.0",
         "https://bitcointalk.org/index.php?board=159.40",
         "https://bitcointalk.org/index.php?board=159.80"]

def get_span(links):
    rv = []
    r = re.compile(r'\d{7,}\.\d+')
    for url in links:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        # store (href, link text) tuples this time
        rv.extend((a['href'], a.text) for a in soup.select('span[id^="msg_"] > a') if r.search(a['href']))
    return sorted(rv, key=lambda k: float(r.search(k[0]).group(0)), reverse=True)

all_links = get_span(links)

# print links on screen:
for link, text in all_links:
    print('{} {}'.format(link, text))
Prints:
https://bitcointalk.org/index.php?topic=5255494.0 NUL Token - A new hyper-deflationary experiment! Airdrop!
https://bitcointalk.org/index.php?topic=5255416.0 KEEP NETWORK - A privacy layer for Ethereum
https://bitcointalk.org/index.php?topic=5255389.0 [ANN] ICO - OBLICHAIN | Blockchain technology at the service of creative genius
https://bitcointalk.org/index.php?topic=5255376.0 UniChain - The 4th Generation Blockchain Made For The Smart Society 5.0
https://bitcointalk.org/index.php?topic=5255316.0 INFINITE RICKS ! First Multiverse Cryptocurrency ! PoS 307%
https://bitcointalk.org/index.php?topic=5254720.0 [GMC] GameCredits - Unofficial & Unmoderated for Censored Posts.
https://bitcointalk.org/index.php?topic=5254480.0 [ANN] [BTCV] Bitcoin Vault - A higher standard in security
https://bitcointalk.org/index.php?topic=5254448.0 [ANN] Silvering (SLVG) token - New Silver Asset Backed Cryptocurrency
... and so on.

Related

Web scraping bus stops with BeautifulSoup

I am trying to web scrape bus stop names for a given line; here is an example page for line 212: https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. As output I want two lists, one with the bus stop names in one direction and the other with the names in the opposite direction (the split is clearly visible on the web page). I managed to get all the names in one list with
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    print(soup.prettify())
    all_bus_stops = []
    table = soup.find_all('a')
    for element in table:
        if element.get_text() in all_bus_stops:
            continue
        else:
            all_bus_stops.append(element.get_text())
    return all_bus_stops

print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    # each ".holo-list" <ul> holds the stops for one direction
    for s in soup.select(".holo-list"):
        bus_stops = []
        for f in s.findAll("li"):
            if f.text not in bus_stops:
                bus_stops.append(f.text)
        all_bus_stops.append(bus_stops)
    return all_bus_stops

print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    # the two "holo-list" <ul> elements are the two directions
    directions = soup.find_all("ul", {"class": "holo-list"})
    for stop in directions[0].find_all("a"):
        # compare the text, not the Tag, so the duplicate check actually works
        if stop.text.strip() not in bus_stops_1:
            bus_stops_1.append(stop.text.strip())
    for stop in directions[1].find_all("a"):
        if stop.text.strip() not in bus_stops_2:
            bus_stops_2.append(stop.text.strip())
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops

print(download_bus_schedule('212')[0])
print(download_bus_schedule('212')[1])
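Note that the two print calls each invoke download_bus_schedule, so the page is fetched twice. Calling it once and unpacking the returned tuple avoids the second request; a minimal adjustment (the variable names here are just illustrative):

stops_dir1, stops_dir2 = download_bus_schedule('212')
print(stops_dir1)
print(stops_dir2)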
I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
from pprint import pprint
import requests

url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")

d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    ul = h2.next_sibling
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)

pprint(d)
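A caveat with this approach (my note, not part of the original answer): depending on the parser and the whitespace in the markup, h2.next_sibling can return a text node rather than the <ul>. If you hit that, bs4's find_next_sibling skips over intervening text nodes:

ul = h2.find_next_sibling('ul')  # first following <ul> sibling, skipping text nodes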
As all stops are encapsulated in the next unordered list, you could use bs4's find_next function, e.g.:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    directions = ["Ch Targówek", "Pl.Hallera"]
    result = {}
    for direction in directions:
        # locate the direction header text, then take the next <ul> after it
        header = soup.find(text=direction)
        stops_list = header.find_next("ul")
        result[direction] = [stop.get_text() for stop in stops_list]
    return result
Plus, you might want to use f-strings to format your strings, as they improve readability and are less error-prone.
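For example, both lines below build the same URL, but the f-string keeps the value inline where it is used:

bus_number = '212'
url_concat = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
url_fstring = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
assert url_concat == url_fstring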

How can I get data from a website using BeautifulSoup and requests?

I am a beginner in web scraping, and I need help with this problem.
The website allrecipes.com lets you find recipes based on a search, which in this case is 'pie':
link to the html file:
'view-source:https://www.allrecipes.com/search/results/?wt=pie&sort=re'
(right click -> view page source)
I want to create a program that takes an input, searches it on allrecipes, and returns a list of tuples for the first five recipes, with data such as the time it takes to make, serving yield, ingredients, and more.
This is my program so far:
import requests
from bs4 import BeautifulSoup

def searchdata():
    inp = input('what recipe would you like to search')
    url = 'http://www.allrecipes.com/search/results/?wt=' + str(inp) + '&sort=re'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    # fill in code for finding top 3 or five links
    for i in range(3):
        a = requests.get(links[i])
        soupa = BeautifulSoup(a.text, 'html.parser')
        # fill in code to find name, ingredients, time, and serving size with data from soupa
        names = []
        time = []
        servings = []
        ratings = []
        ingredients = []

searchdata()
Yes, I know my code is very messy, but what should I fill in in the two fill-in areas?
Thanks
After searching for the recipe you have to get the links of each result and then request each of those links again, because the information you're looking for is not available on the search page. That would not look clean without OOP, so here's the class I wrote that does what you want.
import requests
from bs4 import BeautifulSoup

class Scraper:
    links = []
    names = []

    def get_url(self, url):
        url = requests.get(url)
        self.soup = BeautifulSoup(url.content, 'html.parser')

    def print_info(self, name):
        self.get_url(f'https://www.allrecipes.com/search/results/?wt={name}&sort=re')
        if self.soup.find('span', class_='subtext').text.strip()[0] == '0':
            print(f'No recipes found for {name}')
            return
        results = self.soup.find('section', id='fixedGridSection')
        articles = results.find_all('article')
        texts = []
        for article in articles:
            txt = article.find('h3', class_='fixed-recipe-card__h3')
            if txt:
                if len(texts) < 5:
                    texts.append(txt)
                else:
                    break
        self.links = [txt.a['href'] for txt in texts]
        self.names = [txt.a.span.text for txt in texts]
        self.get_data()

    def get_data(self):
        for i, link in enumerate(self.links):
            self.get_url(link)
            print('-' * 4 + self.names[i] + '-' * 4)
            info_names = [div.text.strip() for div in self.soup.find_all(
                'div', class_='recipe-meta-item-header')]
            ingredient_spans = self.soup.find_all('span', class_='ingredients-item-name')
            ingredients = [span.text.strip() for span in ingredient_spans]
            for i, div in enumerate(self.soup.find_all('div', class_='recipe-meta-item-body')):
                print(info_names[i].capitalize(), div.text.strip())
            print()
            print('Ingredients'.center(len(ingredients[0]), ' '))
            print('\n'.join(ingredients))
            print()
            print('*' * 50, end='\n\n')

chrome = Scraper()
chrome.print_info(input('What recipe would you like to search: '))

Duplicate links in Python

Good morning, world.
I'm new to Python and trying things out. I'm trying to remove duplicate links from the run below.
Currently there are 253 links retrieved. Can someone please help me with this?
import requests
from bs4 import BeautifulSoup
import csv

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
links = soup.find_all("a")
print('Number of links retrieved: ', len(links))
Convert it to a set and it will remove duplicates:
links = set(soup.find_all("a"))
out:
Number of links retrieved: 244
A set will not care about the ordering. Therefore I used a list instead, cleaning up each href properly. Now the length is 123:
from bs4 import BeautifulSoup
import requests

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
soup = BeautifulSoup(r.text, 'html.parser')

links = []
for item in soup.findAll("a", href=True):
    item = item.get("href")
    # prefix relative hrefs with the site root
    if not item.startswith("h"):
        item = f"https://www.census.gov/{item}"
    if item not in links:
        links.append(item)
        print(item)

print(len(links))
Output:
https://www.census.gov/#content
https://www.census.gov/en.html
https://www.census.gov/topics/population/age-and-sex.html
https://www.census.gov/businessandeconomy
https://www.census.gov/topics/education.html
https://www.census.gov/topics/preparedness.html
https://www.census.gov/topics/employment.html
https://www.census.gov/topics/families.html
https://www.census.gov/topics/population/migration.html
https://www.census.gov/geo
https://www.census.gov/topics/health.html
https://www.census.gov/topics/population/hispanic-origin.html
https://www.census.gov/topics/housing.html
https://www.census.gov/topics/income-poverty.html
https://www.census.gov/topics/international-trade.html
https://www.census.gov/topics/population.html
https://www.census.gov/topics/population/population-estimates.html
https://www.census.gov/topics/public-sector.html
https://www.census.gov/topics/population/race.html
https://www.census.gov/topics/research.html
https://www.census.gov/topics/public-sector/voting.html
https://www.census.gov/about/index.html
https://www.census.gov/data
https://www.census.gov/academy
https://www.census.gov/about/what/admin-data.html
https://www.census.gov/data/data-tools.html
https://www.census.gov/developers/
https://www.census.gov/data/experimental-data-products.html
https://www.census.gov/data/related-sites.html
https://www.census.gov/data/software.html
https://www.census.gov/data/tables.html
https://www.census.gov/data/training-workshops.html
https://www.census.gov/library/visualizations.html
https://www.census.gov/library.html
https://www.census.gov/AmericaCounts
https://www.census.gov/library/audio.html
https://www.census.gov/library/fact-sheets.html
https://www.census.gov/library/photos.html
https://www.census.gov/library/publications.html
https://www.census.gov/library/video.html
https://www.census.gov/library/working-papers.html
https://www.census.gov/programs-surveys/are-you-in-a-survey.html
https://www.census.gov/programs-surveys/decennial-census/2020census-redirect.html
https://www.census.gov/2020census
https://www.census.gov/programs-surveys/acs
https://www.census.gov/programs-surveys/ahs.html
https://www.census.gov/programs-surveys/abs.html
https://www.census.gov/programs-surveys/asm.html
https://www.census.gov/programs-surveys/cog.html
https://www.census.gov/programs-surveys/cbp.html
https://www.census.gov/programs-surveys/cps.html
https://www.census.gov/EconomicCensus
https://www.census.gov/internationalprograms
https://www.census.gov/programs-surveys/metro-micro.html
https://www.census.gov/popest
https://www.census.gov/programs-surveys/popproj.html
https://www.census.gov/programs-surveys/saipe.html
https://www.census.gov/programs-surveys/susb.html
https://www.census.gov/programs-surveys/sbo.html
https://www.census.gov/sipp/
https://www.census.gov/programs-surveys/surveys-programs.html
https://www.census.gov/newsroom.html
https://www.census.gov/partners
https://www.census.gov/programs-surveys/sis.html
https://www.census.gov/NAICS
https://www.census.gov/library/reference/code-lists/schedule/b.html
https://www.census.gov/data/developers/data-sets/Geocoding-services.html
https://www.census.gov/about-us
https://www.census.gov/about/who.html
https://www.census.gov/about/what.html
https://www.census.gov/about/business-opportunities.html
https://www.census.gov/careers
https://www.census.gov/fieldjobs
https://www.census.gov/about/history.html
https://www.census.gov/about/policies.html
https://www.census.gov/privacy
https://www.census.gov/regions
https://www.census.gov/about/contact-us/staff-finder.html
https://www.census.gov/about/contact-us.html
https://www.census.gov/about/faqs.html
https://www.commerce.gov/
https://www.census.gov//en.html
https://www.census.gov//programs-surveys.html
https://www.census.gov//popest
https://www.census.gov//programs-surveys/popest/about.html
https://www.census.gov//programs-surveys/popest/data.html
https://www.census.gov//programs-surveys/popest/geographies.html
https://www.census.gov//programs-surveys/popest/guidance.html
https://www.census.gov//programs-surveys/popest/guidance-geographies.html
https://www.census.gov//programs-surveys/popest/library.html
https://www.census.gov//programs-surveys/popest/news.html
https://www.census.gov//programs-surveys/popest/technical-documentation.html
https://www.census.gov//programs-surveys/popest/data/tables.html
https://www.census.gov//programs-surveys/popest/about/schedule.html
https://www.census.gov//newsroom/press-releases/2019/popest-nation.html
https://www.census.gov//newsroom/press-releases/2019/popest-nation/popest-nation-spanish.html
https://www.census.gov//newsroom/press-releases/2019/new-years-2020.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-national.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-state.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-county.html
https://www.census.gov//library/publications/2015/demo/p25-1142.html
https://www.census.gov//library/publications/2010/demo/p25-1139.html
https://www.census.gov//library/publications/2010/demo/p25-1138.html
https://www.census.gov//programs-surveys/popest/library/publications.html
https://www.census.gov//library/visualizations/2020/comm/superbowl.html
https://www.census.gov//library/visualizations/2019/comm/slower-growth-nations-pop.html
https://www.census.gov//library/visualizations/2019/comm/happy-new-year-2020.html
https://www.census.gov//programs-surveys/popest/library/visualizations.html
https://www.census.gov/#
https://www.census.gov/#uscb-nav-skip-header
https://www.census.gov/newsroom/blogs.html
https://www.census.gov/newsroom/stories.html
https://www.facebook.com/uscensusbureau
https://twitter.com/uscensusbureau
https://www.linkedin.com/company/us-census-bureau
https://www.youtube.com/user/uscensusbureau
https://www.instagram.com/uscensusbureau/
https://www.census.gov/quality/
https://www.census.gov/datalinkage
https://www.census.gov/about/policies/privacy/privacy-policy.html#accessibility
https://www.census.gov/foia
https://www.usa.gov/
https://www.census.gov//
123
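As a side note (my addition, not part of the answer above): if all you need is to drop duplicates while keeping the first-seen order, dict.fromkeys does it in one line, because dicts preserve insertion order in Python 3.7+. Assuming links is the cleaned list built above:

unique_links = list(dict.fromkeys(links))  # dedupes, keeps original order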

Web Scraping loop structure issues

I'm currently writing some code to web scrape from AutoTrader as a practice project. I'm having trouble printing the results I need.
The desired output should be:
Car 1
Specs Car 1
Instead, it's:
Car 1
Specs Car 1
Specs Car 2
Specs Car X
Car 2
Where in my looping structure am I going wrong?
from bs4 import BeautifulSoup
import requests
page_link = ("https://www.autotrader.co.uk/car-search?sort=price-asc&radius=1500&postcode=lu15jq&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&make=AUDI&model=A5&price-to=8500&year-from=2008&maximum-mileage=90000&transmission=Automatic&exclude-writeoff-categories=on")
LN = 0
r = requests.get(page_link)
c = r.content
soup = BeautifulSoup(c,"html.parser")
all = soup.find_all("h2",{"class":"listing-title title-wrap"})
all2 = soup.find_all('ul',{"class" :'listing-key-specs '})
The above block of code is fine. The below block prints the outputs.
LN = -1
ListTotal = len(all)
for item in all:
    if LN <= ListTotal:
        LN += 1
        print(item.find("a", {"class": "js-click-handler listing-fpa-link"}).text)
        for carspecs in all2:
            print(carspecs.text)
    else:
        break
Thanks
Because you're printing every carspec in all2 on every iteration of the outer loop:

all = ...
all2 = ...

for item in all:
    ...
    for carspecs in all2:
        # will print everything in all2 on each iteration of all
        print(carspecs.text)
I suspect you want:

for item, specs in zip(all, all2):
    ...
    print(specs.text)
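One thing to be aware of with this fix (my note): zip pairs the two result lists positionally and stops at the shorter one, so it assumes every title has a matching specs block. If a listing might be missing one, itertools.zip_longest pads with None instead of silently dropping items:

from itertools import zip_longest

for item, specs in zip_longest(all, all2, fillvalue=None):
    if item is None or specs is None:
        print('unmatched listing')  # one list ran out before the other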
Just FYI, I cleaned up your code with better logic and names, got rid of superfluous stuff, and made it obey the Python style guide:
import requests
from bs4 import BeautifulSoup

page_link = ("https://www.autotrader.co.uk/car-search?sort=price-asc&"
             "radius=1500&postcode=lu15jq&onesearchad=Used&"
             "onesearchad=Nearly%20New&onesearchad=New&make=AUDI&model=A5"
             "&price-to=8500&year-from=2008&maximum-mileage=90000"
             "&transmission=Automatic&exclude-writeoff-categories=on")

request = requests.get(page_link)
conn = request.content
soup = BeautifulSoup(conn, "html.parser")

# don't overload the inbuilt `all`
cars = soup.find_all("h2", {"class": "listing-title title-wrap"})
cars_specs = soup.find_all('ul', {"class": 'listing-key-specs '})

for car, specs in zip(cars, cars_specs):
    # your logic with regards to the `LN` variable did absolutely nothing
    print(car.find("a", {"class": "js-click-handler listing-fpa-link"}).text)
    print(specs.text)

Python index function

I am writing a simple Python program which grabs a webpage and finds all the URL links in it. However, when I try to index the starting and ending delimiters (") of each href link, the ending one is always indexed wrong.
# open a url and find all the links in it
import urllib2

url = urllib2.urlopen('right.html')
urlinfo = url.info()
urlcontent = url.read()

bodystart = urlcontent.index('<body')
print 'body starts at', bodystart
bodycontent = urlcontent[bodystart:].lower()
print bodycontent

linklist = []
n = bodycontent.index('<a href=')
while n:
    print n
    bodycontent = bodycontent[n:]
    a = bodycontent.index('"')
    b = bodycontent[(a+1):].index('"')
    print a, b
    linklist.append(bodycontent[(a+1):b])
    n = bodycontent[b:].index('<a href=')

print linklist
I would suggest using an HTML parsing library instead of manually searching the DOM string. (Incidentally, the reason your ending delimiter is indexed wrong is that b = bodycontent[(a+1):].index('"') is an offset relative to the slice starting at a+1, but you then use it as an absolute index in bodycontent[(a+1):b]; you would need bodycontent[(a+1):(a+1+b)].)
Beautiful Soup is an excellent library for this purpose (see its reference documentation).
With bs4 your link-searching functionality could look like:
from bs4 import BeautifulSoup
soup = BeautifulSoup(bodycontent, 'html.parser')
linklist = [a.get('href') for a in soup.find_all('a')]
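For completeness, here is a minimal self-contained sketch of the same idea (assuming the page is fetched over HTTP with requests instead of the local 'right.html' file from the question):

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://example.com/')
soup = BeautifulSoup(resp.text, 'html.parser')
# collect every href, skipping <a> tags that have none
linklist = [a.get('href') for a in soup.find_all('a') if a.get('href')]
print(linklist)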
