BeautifulSoup and requests.post - Python

I'm practicing scraping a site and I ran into a mysterious situation.
import requests
from bs4 import BeautifulSoup
import json

class n_auction(object):
    def __init__(self):
        self.search_request = {
            'lawsup': 0,
            'lesson': 0,
            'next_biddate1': '',
            'next_biddate2': '',
            'state': 91,
            'b_count1': 0,
            'b_count2': 0,
            'b_area1': '',
            'b_area2': '',
            'special': 0,
            'e_area1': '',
            'e_area2': '',
            'si': 11,
            'gu': 0,
            'dong': 0,
            'apt_no': 0,
            'order': '',
            'start': 60,
            'total_record_val': 850,
            'detail_search': '',
            'detail_class': '',
            'recieveCode': '',
        }
        self.headers = {'User-Agent': 'Mozilla/5.0',
                        'Referer': 'http://goodauction.land.naver.com/auction/ca_list.php'}

    def scrape(self, max_pages):
        addr = []
        pageno = 0
        self.search_request['start'] = pageno
        while pageno < max_pages:
            payload = json.dumps(self.search_request)
            r = requests.post('http://goodauction.land.naver.com/auction/ax_list.php',
                              data=payload, headers=self.headers)
            print(r.text)
            s = BeautifulSoup(r.text)
            print(s)

if __name__ == '__main__':
    scraper = n_auction()
    scraper.scrape(30)
When I print(r.text), I get the full response text. But after passing it through BeautifulSoup, some of the values are lost. It's very confusing. Help me~~

Switching the parser from the one picked by default on my system, lxml, to html.parser worked for me.
Try: s = BeautifulSoup(r.text, 'html.parser')
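For reference, a minimal sketch of the fix applied inside the original scrape method. The pageno increment and moving 'start' into the loop are my assumptions so the loop terminates and pages advance; they are not part of the original question:

def scrape(self, max_pages):
    pageno = 0
    while pageno < max_pages:
        self.search_request['start'] = pageno   # assumption: paginate via 'start'
        payload = json.dumps(self.search_request)
        r = requests.post('http://goodauction.land.naver.com/auction/ax_list.php',
                          data=payload, headers=self.headers)
        s = BeautifulSoup(r.text, 'html.parser')  # name the parser explicitly
        print(s)
        pageno += 1   # assumption: advance the page counter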


I'm not able to split my code into functions

I wrote code to download PDFs from a website, and it works perfectly, downloading all of the PDFs (first code below). However, when I split the code into functions, only two links end up in the "papers" list and execution finishes with exit code zero, but the following warning message appears:
GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 11 of the file C:\Downloads\EditoraCL\download_pdf.py. To get rid of this warning, pass the additional argument 'features="html.parser"' to the BeautifulSoup constructor.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
FIRST CODE:
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

papers = []
pdfs = []

http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')

for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        papers.append(link['href'])
print(papers)

for x in papers:
    if x.endswith('pdf'):
        pdfs.append(x)
print(pdfs)

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        resposta.raise_for_status()

if __name__ == '__main__':
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    for i in range(1, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        z = url_basica.format(a)
        y = requests.get(z)
        if y.status_code != 404:
            baixa_arquivo(z, nome_do_arquivo)
CODE DIVIDED INTO FUNCTIONS:
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

papers = []
pdfs = []

def busca_links():
    http = httplib2.Http()
    status, response = http.request('https://www.snh2021.anpuh.org/site/anais')
    for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            papers.append(link['href'])
    return papers

def links_pdf():
    for x in papers:
        if x.endswith('pdf'):
            pdfs.append(x)
    return pdfs

def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        return f'Download concluído. Salvo em {endereco}'
    else:
        resposta.raise_for_status()

if __name__ == '__main__':
    busca_links()
    links_pdf()
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    print(papers)
    print(pdfs)
    for i in range(1, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        z = url_basica.format(a)
        y = requests.get(z)
        if y.status_code != 404:
            baixa_arquivo(z, nome_do_arquivo)
Could someone help me understand why the second version of the code behaves this way?
Functions do not share their inner variables, so to make your code work you should assign the returned value back to a name at the call site (papers = busca_links()) and pass it on explicitly (links_pdf(papers)).
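A minimal sketch of that change (busca_links stays as defined in the question; links_pdf now takes the list it works on, as suggested above):

def links_pdf(papers):
    # Build the pdf list from the links that were passed in.
    return [x for x in papers if x.endswith('pdf')]

if __name__ == '__main__':
    papers = busca_links()    # capture the returned list
    pdfs = links_pdf(papers)  # pass it on explicitly
    print(papers)
    print(pdfs)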
In any case, for the sake of organization and clearer code, you should use classes and methods:
import os
import requests
import httplib2
from bs4 import BeautifulSoup, SoupStrainer

class Pdf:
    def __init__(self, base_url, url):
        self.main_dir = os.path.dirname(__file__)
        self.pdfs_dir = os.path.join(self.main_dir, 'pdfs')
        self.base_url = base_url
        self.url = url

    def get_links(self):
        http = httplib2.Http()
        status, response = http.request(self.url)
        self.links = []
        for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].endswith('pdf'):
                    self.links.append(f"{self.base_url}{link['href']}")

    def download_pdf(self):
        for link in self.links:
            response = requests.get(link, stream=True)
            if response.status_code == 200:
                file_path = os.path.join(self.pdfs_dir, link.split('/')[-1])
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print('Success. Saved on {}'.format(file_path))
            else:
                # Should handle errors here, by appending them to a list and
                # trying again later.
                print('Error.')

if __name__ == '__main__':
    base_url = 'https://www.snh2021.anpuh.org/'
    url = f'{base_url}site/anais'
    pdf = Pdf(base_url, url)
    pdf.get_links()
    pdf.download_pdf()
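As a side note, the GuessedAtParserWarning quoted in the question is most likely unrelated to the missing links; it disappears once the parser is named explicitly, e.g.:

for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    ...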

webscraping bus stops with beautifulsoup

I am trying to scrape the bus stop names for a given line. Here is an example page for line 212: https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want two lists as output, one with the bus stop names in one direction and the other with the names in the opposite direction (the split is clearly visible on the page). I managed to get all the names in a single list with:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    print(soup.prettify())
    all_bus_stops = []
    table = soup.find_all('a')
    for element in table:
        if element.get_text() in all_bus_stops:
            continue
        else:
            all_bus_stops.append(element.get_text())
    return all_bus_stops

print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can select each direction's .holo-list element and use the bs4.element.Tag.findAll method on it:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    for s in soup.select(".holo-list"):
        bus_stops = []
        for f in s.findAll("li"):
            if f.text not in bus_stops:
                bus_stops.append(f.text)
        all_bus_stops.append(bus_stops)
    return all_bus_stops

print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    directions = soup.find_all("ul", {"class": "holo-list"})
    for stop in directions[0].find_all("a"):
        if stop.text.strip() not in bus_stops_1:
            bus_stops_1.append(stop.text.strip())
    for stop in directions[1].find_all("a"):
        if stop.text.strip() not in bus_stops_2:
            bus_stops_2.append(stop.text.strip())
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops

all_stops = download_bus_schedule('212')
print(all_stops[0])
print(all_stops[1])
I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
from pprint import pprint
import requests

url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")

d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    ul = h2.next_sibling
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)

pprint(d)
As all the stops are encapsulated in the next unordered list, you could use the find_next function of bs4, e.g.:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    directions = ["Ch Targówek", "Pl.Hallera"]
    result = {}
    for direction in directions:
        header = soup.find(text=direction)
        stop_list = header.find_next("ul")
        stops_names = [stop.get_text() for stop in stop_list]
        result[direction] = stops_names
    return result
Plus, you might want to use f-strings to format your strings, as they improve readability and are less error-prone.

find data dictionary behind URL

I want to find the data dictionary behind these URLs:
https://www.coingecko.com/fr/pi%C3%A8ces/1/markets_tab --> BTC
https://www.coingecko.com/fr/pi%C3%A8ces/2/markets_tab --> LTC
https://www.coingecko.com/fr/pi%C3%A8ces/3/markets_tab --> AUR
https://www.coingecko.com/fr/pi%C3%A8ces/?/markets_tab --> ?
https://www.coingecko.com/fr/pi%C3%A8ces/100/markets_tab --> XLM
from bs4 import BeautifulSoup
from time import sleep
import requests

i = 0
while(True):
    try:
        if i == 0:
            url = "https://www.coingecko.com/fr/pi%C3%A8ces/1/markets_tab"
        else:
            url = "https://www.coingecko.com/fr/pi%C3%A8ces/{}/markets_tab".format(i)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        print(url)
        sleep(2)
        i += 2
    except:
        break
I want to scan all the numbers from 1 to 100 in order to find the associated coin for each, using Python.
I'm not sure what you are looking for, as your question is unclear, but with the following code you can loop over the pages and then extract whatever you need:
import requests
from bs4 import BeautifulSoup

with requests.Session() as req:
    for item in range(1, 101):
        r = req.get(f"https://www.coingecko.com/fr/pi%C3%A8ces/{item}/markets_tab")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            # Do whatever.
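For example, to build the number-to-coin mapping, the loop could pull a name out of each page. This is only a sketch, under the assumption that the coin name appears in the page's <title>; the real markup of the markets_tab page may differ, so the selector may need adjusting:

import requests
from bs4 import BeautifulSoup

coins = {}
with requests.Session() as req:
    for item in range(1, 101):
        r = req.get(f"https://www.coingecko.com/fr/pi%C3%A8ces/{item}/markets_tab")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            if soup.title:
                # Assumption: the coin name is in the <title> tag.
                coins[item] = soup.title.get_text(strip=True)
print(coins)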

I am using BeautifulSoup, how can I get the link after the redirect?

I want to get the link that the download link on the article page redirects to.
For example:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
In the above article page, there are the following download links:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=
If you open this link directly, it will not redirect to the real download link; you need to open it from within the article page.
# coding=utf-8
import lxml
import re
import requests
import sys
from bs4 import BeautifulSoup
from urllib.request import urlopen

def urlopen(url):
    '''
    using requests to replace urllib.requests.urlopen
    return an html
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text

def generate_pages(subTitle, fromPage, toPage):
    '''
    return page sites' url list
    '''
    pages = []
    if(fromPage > 0 and fromPage < toPage):
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages

def get_book_sites_of_one_page(page):
    '''
    get book site's url in one page
    input: page site url
    output: book site urls list
    return book sites in one page
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    for link in linkList[::2]:
        if 'href' in link.attrs:
            #print(link)
            bookSites.append(link.attrs['href'])
    return bookSites

def get_book_urls(bookSite):
    '''
    input a book site
    find book downloading urls in this book site
    then
    return them as a list
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll("a", {"target": "_blank"})
    for link in linkList[::2]:
        # print(link)
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs

def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)
    for page in pages:
        bookSiteOfOnePage = get_book_sites_of_one_page(page)
        bookSites.extend(bookSiteOfOnePage)
    for bookSite in bookSites:
        book_urls = get_book_urls(bookSite)
        bookURLs += book_urls
    for bookURL in bookURLs:
        print(bookURL)
    #with open(filename, 'w') as f:
    #    f.write(bookURLs)

def main():
    if(len(sys.argv) == 4):
        '''
        python getUrl.py 1, 100, programming
        from page 1 to page in subject programming
        '''
        subTitle = str(sys.argv[3])
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)
    if(len(sys.argv) == 3):
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        subTitle = ''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        #filename = subTitle="-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif(len(sys.argv) == 2):
        '''
        python getUrl.py 10
        from page 10 to page 10
        only download books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage + 1
        subTitle = ''
        #filename = "All-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif(len(sys.argv) == 1):
        fromPage = 1
        # custom page range
        toPage = 2
        subTitle = ''
        #filename = "All-"+"1"+"-"+time.strftime('%Y-%m-%d', time.localtime())+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    else:
        print("Error, too many arguments")

if __name__ == '__main__':
    #filename = ''
    main()
Thank you for your help!
This website checks whether the Referer header is set when redirecting. You can just pass the original URL as the Referer in the headers and easily bypass this. You can also see that the referer is used as a URL parameter in the final download link.
import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url
}
r2 = requests.get(download_redirect_link, headers=headers)
print(r2.url)
Output
https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/

How to get a favicon using Beautiful Soup and Python

I wrote some simple code just for learning, but it doesn't work for most sites.
Here is the code:
import urllib2, re
from BeautifulSoup import BeautifulSoup as Soup

class Founder:
    def Find_all_links(self, url):
        page_source = urllib2.urlopen(url)
        a = page_source.read()
        soup = Soup(a)
        a = soup.findAll(href=re.compile(r'/.a\w+'))
        return a

    def Find_shortcut_icon(self, url):
        a = self.Find_all_links(url)
        b = ''
        for i in a:
            strre = re.compile('shortcut icon', re.IGNORECASE)
            m = strre.search(str(i))
            if m:
                b = i["href"]
        return b

    def Save_icon(self, url):
        url = self.Find_shortcut_icon(url)
        print url
        host = re.search(r'[0-9a-zA-Z]{1,20}\.[a-zA-Z]{2,4}', url).group()
        opener = urllib2.build_opener()
        icon = opener.open(url).read()
        file = open(host + '.ico', "wb")
        file.write(icon)
        file.close()
        print '%s icon successfully saved' % host

c = Founder()
print c.Save_icon('http://lala.ru')
The strangest thing is that it works for these sites:
http://habrahabr.ru
http://5pd.ru
but doesn't work for most others that I've checked.
You're making it far more complicated than it needs to be. Here's a simple way to do it:
import urllib
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, matching the question's import

page = urllib.urlopen("http://5pd.ru/")
soup = BeautifulSoup(page)
icon_link = soup.find("link", rel="shortcut icon")
icon = urllib.urlopen(icon_link['href'])
with open("test.ico", "wb") as f:
    f.write(icon.read())
Thomas K's answer got me started in the right direction, but I found some websites that don't say rel="shortcut icon", like 1800contacts.com, which says just rel="icon". This works in Python 3 and returns the link. You can write it to a file if you want.
from bs4 import BeautifulSoup
import requests

def getFavicon(domain):
    if 'http' not in domain:
        domain = 'http://' + domain
    page = requests.get(domain)
    soup = BeautifulSoup(page.text, features="lxml")
    icon_link = soup.find("link", rel="shortcut icon")
    if icon_link is None:
        icon_link = soup.find("link", rel="icon")
    if icon_link is None:
        return domain + '/favicon.ico'
    return icon_link["href"]
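A quick usage sketch (the domain here is just an example; note that the returned href can be relative, so you may need to resolve it against the site URL with urllib.parse.urljoin before downloading):

from urllib.parse import urljoin

icon = getFavicon('1800contacts.com')
# Resolve a possibly-relative href against the site root
# (assumes the http scheme that getFavicon prepends above).
print(urljoin('http://1800contacts.com', icon))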
In case anyone wants to use a single check with regex, the following works for me:
import re
from bs4 import BeautifulSoup

html_code = "<Some HTML code you get from somewhere>"
soup = BeautifulSoup(html_code, features="lxml")

for item in soup.find_all('link', attrs={'rel': re.compile("^(shortcut icon|icon)$", re.I)}):
    print(item.get('href'))
This also accounts for variations in case in the rel attribute.
Thank you, kurd. Here is the code with some changes:
import urllib2
from BeautifulSoup import BeautifulSoup

url = "http://www.facebook.com"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
icon_link = soup.find("link", rel="shortcut icon")
try:
    icon = urllib2.urlopen(icon_link['href'])
except:
    icon = urllib2.urlopen(url + icon_link['href'])
iconname = url.split(r'/')
iconname = iconname[2].split('.')
iconname = iconname[1] + '.' + iconname[2] + '.ico'
with open(iconname, "wb") as f:
    f.write(icon.read())
Thank you, Thomas.
Here is the code with some changes:
import urllib2
from BeautifulSoup import BeautifulSoup

page = urllib2.urlopen("http://5pd.ru/")
soup = BeautifulSoup(page.read())
icon_link = soup.find("link", rel="shortcut icon")
icon = urllib2.urlopen(icon_link['href'])
with open("test.ico", "wb") as f:
    f.write(icon.read())
