from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import requests

url = 'https://en.wikisource.org/wiki/Main_Page'
r = requests.get(url)
Soup = BeautifulSoup(r.text, "html5lib")
List = Soup.find("div", class_="enws-mainpage-widget-content", id="enws-mainpage-newtexts-content").find_all('a')
ebooks = []
i = 0
for ebook in List:
    x = ebook.get('title')
    for ch in x:
        if ch == ":":
            x = ""
    if x != "":
        ebooks.append(x)
        i = i + 1
inputnumber = 0
while inputnumber < len(ebooks):
    print(inputnumber + 1, " - ", ebooks[inputnumber])
    inputnumber = inputnumber + 1
input = int(input("Please select a book: "))
selectedbook = Soup.find("a", title=ebooks[input - 1])
print(selectedbook['title'])
url1 = "https://en.wikisource.org/" + selectedbook['href']
print(url1)
r1 = requests.get(url1)
Soup1 = BeautifulSoup(r1.text, "html5lib")
List1 = Soup.find("div", class_="prp-pages-output")
print(List1)
This is my code. I want to get the paragraphs from the HTML at the last step. But as output I get:
1 - The Center of the Web
2 - Bobby Bumps Starts a Lodge
3 - May (Mácha)
4 - Animal Life and the World of Nature/1903/06/Notes and Comments
5 - The Czechoslovak Review/Volume 2/No Compromise
6 - She's All the World to Me
7 - Their One Love
Please select a book: 4
Animal Life and the World of Nature/1903/06/Notes and Comments
https://en.wikisource.org//wiki/Animal_Life_and_the_World_of_Nature/1903/06/Notes_and_Comments
None
Why does List1 come back as None? It shouldn't. Can someone tell me where I am going wrong?
I guess you just typoed Soup1 as Soup. Also, since you are looking for a list of items, you probably want more than one match, so I added the find_all() function.
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import requests

url = "https://en.wikisource.org/wiki/Main_Page"
r = requests.get(url)
Soup = BeautifulSoup(r.text, "html5lib")
List = Soup.find(
    "div", class_="enws-mainpage-widget-content", id="enws-mainpage-newtexts-content"
).find_all("a")
ebooks = []
i = 0
for ebook in List:
    x = ebook.get("title")
    for ch in x:
        if ch == ":":
            x = ""
    if x != "":
        ebooks.append(x)
        i = i + 1
inputnumber = 0
while inputnumber < len(ebooks):
    print(inputnumber + 1, " - ", ebooks[inputnumber])
    inputnumber = inputnumber + 1
input = int(input("Please select a book: "))
selectedbook = Soup.find("a", title=ebooks[input - 1])
print(selectedbook["title"])
url1 = "https://en.wikisource.org/" + selectedbook["href"]
print(url1)
r1 = requests.get(url1)
Soup1 = BeautifulSoup(r1.text, "html5lib")
List1 = Soup1.find_all("div", class_="prp-pages-output")
print(List1)
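Since the goal is the paragraph text of the selected work, here is a minimal follow-up sketch (assuming, as is typical for transcribed Wikisource pages, that the prose sits in <p> tags inside the prp-pages-output div):

for div in List1:
    for p in div.find_all("p"):
        text = p.get_text(strip=True)
        if text:
            print(text)  # print each non-empty paragraph of the selected book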
I am trying to web-scrape bus stop names for a given line; here is an example page for line 212: https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want two lists as output, one with the bus stop names in one direction and the other with the opposite direction (it's clearly visible on the web page). I managed to get all the names in one list with:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    print(soup.prettify())
    all_bus_stops = []
    table = soup.find_all('a')
    for element in table:
        if element.get_text() in all_bus_stops:
            continue
        else:
            all_bus_stops.append(element.get_text())
    return all_bus_stops

print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    for s in soup.select(".holo-list"):
        bus_stops = []
        for f in s.findAll("li"):
            if f.text not in bus_stops:
                bus_stops.append(f.text)
        all_bus_stops.append(bus_stops)
    return all_bus_stops

print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    directions = soup.find_all("ul", {"class": "holo-list"})
    for stop in directions[0].find_all("a"):
        # compare the text, not the tag object, so the duplicate check actually works
        if stop.text.strip() not in bus_stops_1:
            bus_stops_1.append(stop.text.strip())
    for stop in directions[1].find_all("a"):
        if stop.text.strip() not in bus_stops_2:
            bus_stops_2.append(stop.text.strip())
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops

print(download_bus_schedule('212')[0])
print(download_bus_schedule('212')[1])
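As a side note, each of those two print calls runs download_bus_schedule again, so the timetable page is fetched twice. A small sketch of the tidier pattern (variable names here are just illustrative): call the function once and unpack the returned tuple.

stops_there, stops_back = download_bus_schedule('212')  # one request, both directions
print(stops_there)
print(stops_back)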
I may have misunderstood, as I do not know Polish, but see if this helps.
from bs4 import BeautifulSoup
from pprint import pprint
import requests

url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")
d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    ul = h2.next_sibling
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)
pprint(d)
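One caveat worth hedging here: h2.next_sibling can be the whitespace text node between the <h2> and the <ul> rather than the <ul> tag itself, depending on how the page is formatted. bs4's find_next_sibling skips over text nodes when given a tag name, so this variant is safer:

ul = h2.find_next_sibling('ul')  # lands on the next <ul> tag, skipping stray text nodes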
As all stops are encapsulated in the next unordered list, you could use bs4's find_next function, e.g.:
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
directions = ["Ch Targówek","Pl.Hallera"]
result = {}
for direction in directions:
header = soup.find(text=direction)
list = header.find_next("ul")
stops_names = [stop.get_text() for stop in list]
result[direction] = stops_names
return result
Plus, you might want to use f-strings to format your strings, as they improve readability and are less error-prone.
import ssl
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

context = ssl._create_unverified_context()

def PriceOfLegos(Site):
    price = []
    title = []
    LegoWebsite = Site
    uLegoWebsite = uReq(LegoWebsite, context=context)
    LegoWebsiteHTML = uLegoWebsite.read()
    uLegoWebsite.close()
    LegoWebsiteSoup = soup(LegoWebsiteHTML, "html.parser")
    for x in LegoWebsiteSoup.find_all("span", {"class": "visuallyhidden"}):
        text = x.get_text()
    if text[0] == "$":
        price.append(text[1:])
    for x in LegoWebsiteSoup.find_all("a", {"class": "product-title-link line-clamp line-clamp-2"}):
        title_text = x.get_text()
        title.append(title_text)
    for x in price:
        print("$", x, sep="")

z = PriceOfLegos("https://www.walmart.com/search/?query=Lego%20horse")
print(z)
The scraping works when the code is not a function and LegoWebsite is set directly to a URL. The only problem is I want it to be more dynamic, so I can enter any search URL on Walmart and it will display the prices. The problem I'm facing is that when I run this, my output is "None".
The variable text is never available where it's needed, because this check sits outside the loop:

if text[0] == "$":
    price.append(text[1:])

So when you print it, your price list is empty, since nothing gets appended: the condition

if text[0] == "$":

never becomes True because the text variable does not exist for the if statement. Try this:
import ssl
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

context = ssl._create_unverified_context()

def PriceOfLegos(Site):
    price = []
    title = []
    LegoWebsite = Site
    uLegoWebsite = uReq(LegoWebsite, context=context)
    LegoWebsiteHTML = uLegoWebsite.read()
    uLegoWebsite.close()
    LegoWebsiteSoup = soup(LegoWebsiteHTML, "html.parser")
    for x in LegoWebsiteSoup.find_all("span", {"class": "visuallyhidden"}):
        text = x.get_text()
        if text[0] == "$":
            price.append(text[1:])
    for x in LegoWebsiteSoup.find_all("a", {"class": "product-title-link line-clamp line-clamp-2"}):
        title_text = x.get_text()
        title.append(title_text)
    for x in price:
        print("$", x, sep="")

PriceOfLegos("https://www.walmart.com/search/?query=Lego%20horse")
I am trying to crawl a site's article text, but it's only crawling 12 articles.
I don't know why it does that, and I am wondering: if I want to crawl the other pages, what should I do?
import requests
from bs4 import BeautifulSoup

x = int(input("start page:"))
while x < int(input("end page:")):
    x = x + 1
    url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    read_more = content.findAll("div", {"class": "read-more"})
    for item in read_more:
        atag = item.find('a')
        link = "https://www.mmtimes.com" + atag["href"]
        linkResult = requests.get(link)
        subpage = BeautifulSoup(linkResult.content, "html.parser")
        fnresult = subpage.find("div", {"class": "field-item even"})
        print(fnresult.text)
    print("Total " + str(len(read_more)) + " articles")
Check out the code below; I have made some changes that will produce the required output.
import requests
from bs4 import BeautifulSoup

x = int(input("start page:"))
y = input("end page:")
article_count = 0
while x <= int(y):
    url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    read_more = content.findAll("div", {"class": "read-more"})
    for item in read_more:
        atag = item.find('a')
        link = "https://www.mmtimes.com" + atag["href"]
        linkResult = requests.get(link)
        subpage = BeautifulSoup(linkResult.content, "html.parser")
        fnresult = subpage.find("div", {"class": "field-item even"})
        print(fnresult.text)
    article_count += len(read_more)
    print("Total " + str(article_count) + " articles")
    x += 1
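Since every listing page and every article triggers its own requests.get, a small optional optimization (a sketch, not required for correctness) is to reuse a single requests.Session, which keeps the HTTP connection alive across all those fetches:

session = requests.Session()  # create once, before the while loop
# ...then replace each requests.get(...) call with session.get(...):
result = session.get(url)
linkResult = session.get(link)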
I am currently trying to web-scrape protein sequences off of the NCBI protein database. At this point, the user can search for a protein and I can get the link to the first result that the database spits out. However, when I run this through Beautiful Soup, the soup does not match the Chrome inspect element, nor does it contain the sequence at all.
Here is my current code:
import string
import requests
from bs4 import BeautifulSoup

def getSequence():
    searchProt = input("Enter a Protein Name!:")
    if searchProt != '':
        searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
        page = requests.get(searchString)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = str(soup)
        accIndex = soup.find("a")
        accessionStart = soup.find('<dd>', accIndex)
        accessionEnd = soup.find('</dd>', accessionStart + 4)
        accession = soup[accessionStart + 4: accessionEnd]
        newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
        try:
            newPage = requests.get(newSearchString)
            # This is where it fails
            newSoup = BeautifulSoup(newPage.text, 'html.parser')
            aaList = []
            spaceCount = newSoup.count("ff_line")
            print(spaceCount)
            for i in range(spaceCount):
                startIndex = newSoup.find("ff_line")
                startIndex = newSoup.find(">", startIndex) + 2
                nextAA = newSoup[startIndex]
                while nextAA in string.ascii_lowercase:
                    aaList.append(nextAA)
                    startIndex += 1
                    nextAA = newSoup[startIndex]
            return aaList
        except:
            print("Please Enter a Valid Protein")
I have been trying to run it with the search 'p53' and have gotten to the link: here
I have looked at a long series of web-scraping entries on this website and tried a lot of things, including installing Selenium and using different parsers. I am still confused about why these don't match. (Sorry if this is a repeat question; I am very new to web scraping and currently have a concussion, so I am looking for a bit of individual case feedback.)
This code will extract the protein sequence you want using Selenium. I've modified your original code to give you the result you wanted.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

driver = webdriver.Firefox()

def getSequence():
    searchProt = input("Enter a Protein Name!:")
    if searchProt != '':
        searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
        page = requests.get(searchString)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = str(soup)
        accIndex = soup.find("a")
        accessionStart = soup.find('<dd>', accIndex)
        accessionEnd = soup.find('</dd>', accessionStart + 4)
        accession = soup[accessionStart + 4: accessionEnd]
        newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
        try:
            driver.get(newSearchString)
            html = driver.page_source
            newSoup = BeautifulSoup(html, "lxml")
            ff_tags = newSoup.find_all(class_="ff_line")
            aaList = []
            for tag in ff_tags:
                aaList.append(tag.text.strip().replace(" ", ""))
            protSeq = "".join(aaList)
            return protSeq
        except:
            print("Please Enter a Valid Protein")

sequence = getSequence()
print(sequence)
Which produces the following output for input of "p53":
meepqsdlsielplsqetfsdlwkllppnnvlstlpssdsieelflsenvtgwledsggalqgvaaaaastaedpvtetpapvasapatpwplsssvpsyktfqgdygfrlgflhsgtaksvtctyspslnklfcqlaktcpvqlwvnstpppgtrvramaiykklqymtevvrrcphherssegdslappqhlirvegnlhaeylddkqtfrhsvvvpyeppevgsdcttihynymcnsscmggmnrrpiltiitledpsgnllgrnsfevricacpgrdrrteeknfqkkgepcpelppksakralptntssspppkkktldgeyftlkirgherfkmfqelnealelkdaqaskgsedngahssylkskkgqsasrlkklmikregpdsd
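One housekeeping note on this sketch: webdriver.Firefox() opens a browser window that the script never closes. Assuming a single lookup per run, it is good practice to quit the driver once the sequence has been printed:

driver.quit()  # close the Firefox window opened at the top of the script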
I was making a checker for opinions on a website, and when going through it, if they both matched, the text would print twice, while when they didn't match it would simply print once. I've been trying to figure out how to print only the true matches, and to print each of them only once.
The CMD output looks like this:
http://prntscr.com/h3ioli
import cfscrape, re, os, time
from bs4 import BeautifulSoup

cc = open('cookie.txt').read()
mybbuser, sid = cc.split(':')
MainScrapper = cfscrape.create_scraper()

def substring_after(string, delim, back):
    return string.partition(delim)[back]

suspect = raw_input('User ID: ')

def reputationCheck(userid):
    reputationlist = []
    r = MainScrapper.get('https://v3rmillion.net/reputation.php?uid={}&show=positive'.format(userid), cookies={'mybbuser': mybbuser, 'sid': sid})
    soup = BeautifulSoup(r.text, 'html.parser')
    reputations = soup.find_all('a', href=re.compile("member\.php\?action=profile\&uid=(\d+)"))
    for reputation in reputations:
        reputationlist = reputationlist + [substring_after(reputation['href'], 'uid=', 2)]
    if soup.find('span', {'class': 'pages'}):
        pages = soup.find('span', {'class': 'pages'}).text
        pages = substring_after(pages, '(', 2)
        pages = substring_after(pages, '):', 0)
        soup = BeautifulSoup(r.text, 'html.parser')
        for x in range(1, (int(pages))):
            r = MainScrapper.get('https://v3rmillion.net/reputation.php?uid={}'.format(userid) + '&show=positive&page={}'.format(x + 1), cookies={'mybbuser': mybbuser, 'sid': sid})
            soup = BeautifulSoup(r.text, 'html.parser')
            reputations = soup.find_all('a', href=re.compile("member\.php\?action=profile\&uid=(\d+)"))
            for reputation in reputations:
                if not reputation == suspect:
                    reputationlist = reputationlist + [substring_after(reputation['href'], 'uid=', 2)]
    for userids in reputationlist:
        if not str(userids) == str(suspect):
            victim = []
            r = MainScrapper.get('https://v3rmillion.net/reputation.php?uid={}'.format(userids) + '&show=positive', cookies={'mybbuser': mybbuser, 'sid': sid})
            soup = BeautifulSoup(r.text, 'html.parser')
            reputations = soup.find_all('a', href=re.compile("member\.php\?action=profile\&uid=(\d+)"))
            for reputation in reputations:
                if substring_after(reputation['href'], 'uid=', 2) == str(suspect):
                    print(str(userids) + 'exchanged reputation with ' + str(suspect))
                else:
                    pass
if not reputation == suspect:
if not str(userids) == str(suspect):
These should be:
if reputation != suspect:
if str(userids) != str(suspect):
Maybe you should put your print function outside the loop. Something like:
a = ['x', 'y', 'z']
c = ''
for b in a:
    c += b
    print('this is inside the loop, prints multiple times: ' + c)
print('this is outside the loop, prints a single time, get it: ' + c)
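Applied to the checker itself, a hypothetical sketch of the same idea (reusing the names from the question's outer loop): collect the matching user IDs in a set while scanning, then print each match exactly once after the scan finishes. The set also removes duplicates, which is what makes each match print only once:

matches = set()  # filled across the whole outer loop
for userids in reputationlist:
    if str(userids) != str(suspect):
        # ...fetch the page and build `reputations` exactly as before...
        for reputation in reputations:
            if substring_after(reputation['href'], 'uid=', 2) == str(suspect):
                matches.add(userids)  # remember the match instead of printing here
for match in matches:
    print(str(match) + ' exchanged reputation with ' + str(suspect))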