I have a script to get the lyrics of a song from MetroLyrics using requests and bs4. The problem is that when I print the result it shows something like this (part of the lyrics):
Rabbi, Papa, Allah, Lama, Imam, Bibbia, Dharma, Sura, Torah, Pane, Vino, Kashèr, ḤalÄl, Yom Kippur, Quaresima, Ramadan
when it should look like this
Rabbi, Papa, Lama, Imam, Bibbia, Dharma, Sura, Torah, Pane, vino, kashèr, ḥalāl, Yom Kippur, Quaresima, Ramadan
The code I use:
import requests
from bs4 import BeautifulSoup
import os

try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus


def get_lyrics(song_name):
    song_name += ' metrolyrics'
    name = quote_plus(song_name)
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11'
                         '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    url = 'http://www.google.com/search?q=' + name
    result = requests.get(url, headers=hdr).text
    link_start = result.find('http://www.metrolyrics.com')
    if(link_start == -1):
        return("Lyrics not found on Metrolyrics")
    link_end = result.find('html', link_start + 1)
    link = result[link_start:link_end + 4]
    lyrics_html = requests.get(link, headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel'
                      'Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/55.0.2883.95 Safari/537.36'
        }
    ).text
    soup = BeautifulSoup(lyrics_html, "lxml")
    raw_lyrics = (soup.findAll('p', attrs={'class': 'verse'}))
    paras = []
    try:
        final_lyrics = unicode.join(u'\n', map(unicode, raw_lyrics))
    except NameError:
        final_lyrics = str.join(u'\n', map(str, raw_lyrics))
    final_lyrics = (final_lyrics.replace('<p class="verse">', '\n'))
    final_lyrics = (final_lyrics.replace('<br/>', ' '))
    final_lyrics = final_lyrics.replace('</p>', ' ')
    return (final_lyrics)
I have tried .encode('utf-8'), .encode('unicode-escape') and then converting back again, but found no solution.
I have another script where I use the Musixmatch API, and there the Unicode characters show up correctly.
I made a small change in the get_lyrics function:
return final_lyrics.encode('latin1').decode('utf-8')
and got the desired output:
# python2
print get_lyrics('kashèr')
...
Rabbi, Papa, Allah, Lama, Imam, Bibbia, Dharma, Sura, Torah, Pane, Vino, Kashèr, Ḥalāl, Yom Kippur, Quaresima, Ramadan
...
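For reference, here is a minimal sketch of why that round trip works (the assumption being that the page is UTF-8 but requests decoded it as Latin-1, which is what produces stray sequences like Ä in place of ā): re-encoding the mangled string as Latin-1 recovers the original UTF-8 bytes, which can then be decoded correctly.

# small, self-contained demonstration of the mojibake round trip
correct = u'kashèr, ḥalāl'
mangled = correct.encode('utf-8').decode('latin1')    # what the scraper ends up with
print(mangled)                                        # garbled output
print(mangled.encode('latin1').decode('utf-8'))       # kashèr, ḥalāl again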
I have a Python script to download PDFs from an ASP site. I would like to save each PDF using the name it is displayed with on the website. So from the HTML line below, get the link to download the PDF and get the name as it is displayed. For the following HTML line:
<a href="https://www.ib3.nl/curriculum/engels\100 TB 3 Ch 3.pdf">Chapter 3 - Weird science</a></li>
get the link https://www.ib3.nl/curriculum/engels\100 TB 3 Ch 3.pdf
and save this PDF as Chapter 3 - Weird science.pdf.
Below is the script I use to get all the PDFs:
from bs4 import BeautifulSoup as BeautifulSoup
import urllib.request as requests
from urllib import parse as urlparse
import requests
import os

klassen = ['1e klas']
vakken = ['Wiskunde']
'''['Engels','Aardrijkskunde','Economie', 'Filosofie','Frans', 'Geschiedenis', \
'Nask', 'Natuurkunde', 'Nederlands', 'Scheikunde', 'Spaans', 'Wiskunde'\
'Biologie', 'Duits', 'Grieks','Latijn','Leesmateriaal', \
'Loopbaanorientatie','NLT']'''

links = []
for klas in klassen:
    for vak in vakken:
        url = "https://www.svpo.nl/curriculum.asp"
        payload = 'vak=' + vak + '&klas_en_schoolsoort=' + klas
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
        }
        response = requests.post(url, data=payload, headers=headers)
        path_out = 'c:\books\\'
        path = (path_out + klas + "\\" + vak + "\\")
        if not(os.path.exists(path)): os.makedirs(path)
        links = BeautifulSoup(response.text, "lxml")#.find_all('a')
        a = BeautifulSoup(response.text, "lxml").find_all('a')
        for link in BeautifulSoup(response.text, "lxml").find_all('a'):
            current_link = link.get('href')
            if str(link.get('href')) != 'None':
                if current_link.endswith('pdf'):
                    print(current_link)
                    links.append(current_link)
                    filename = current_link[current_link.find('\\')+1:]
                    filename_url = urlparse.quote(filename)
                    path_url = current_link[:current_link.find('\\')] + '/' + filename_url
                    os.system('Curl -o "' + path + filename + '" ' + path_url)
Simply:
filename = link.text + '.pdf'
That's all.
My version with changes from comments:
import os
import requests
from bs4 import BeautifulSoup
from urllib import parse as urlparse

klassen = ['1e klas']
vakken = ['Wiskunde']
'''['Engels','Aardrijkskunde','Economie', 'Filosofie','Frans', 'Geschiedenis', \
'Nask', 'Natuurkunde', 'Nederlands', 'Scheikunde', 'Spaans', 'Wiskunde'\
'Biologie', 'Duits', 'Grieks','Latijn','Leesmateriaal', \
'Loopbaanorientatie','NLT']'''

links = []

url = "https://www.svpo.nl/curriculum.asp"

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

path_out = r'c:\books'

for klas in klassen:
    for vak in vakken:
        path = os.path.join(path_out, klas, vak)
        os.makedirs(path, exist_ok=True)

        payload = {'vak': vak, 'klas_en_schoolsoort': klas}
        response = requests.post(url, data=payload, headers=headers)

        all_links = BeautifulSoup(response.text, "lxml").find_all('a', {'href': True})

        for link in all_links:
            url = link.get('href')
            if url.lower().endswith('.pdf'):
                url = url.replace('\\', '/')
                links.append(url)
                print('url:', url)

                #filename = url.split('\\')[-1]
                filename = link.text + '.pdf'
                print('filename:', filename)

                full_path = os.path.join(path, filename)
                print('full_path:', full_path)

                response = requests.get(url)
                with open(full_path, 'wb') as fh:
                    fh.write(response.content)

                print('---')
When trying to run the script the following error appears:
line 16
for tag in jogos:
^
IndentationError: expected an indented block
My expected result is:
COLUMN 1 COLUMN 2
Team A v Team B LINK HREF
Team C v Team D LINK HREF
Team E v Team F LINK HREF
Team G v Team H LINK HREF
Another problem is that when I store the data in namelist and linkslist, only the first value is delivered instead of all the values.
In this answer (https://stackoverflow.com/a/68446386/11462274), printing the values directly, like this:
print(tag.find("a", href=True).get_text().strip())
Or this:
print(tag.find("a", href=True)["href"])
delivers the complete list of values, but when I assign them to specific names like namelist and linkslist, it stops returning the complete list and delivers only one value.
Full Script:
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")

with open('Lista_de_Jogos.csv', 'a', newline='', encoding='UTF8') as f:
for tag in jogos:
    namelist = tag.find("a", href=True).get_text().strip()
    linkslist = tag.find("a", href=True)["href"]
    row = namelist + ';' + linkslist + '\n'
    f.write(row)
The error message is clear: since you're using a context manager (with ...), you have to write the code inside that indentation block.
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")

with open("Lista_de_Jogos.csv", "a", newline="", encoding="UTF8") as f:
    for tag in jogos:
        namelist = tag.find("a", href=True).get_text().strip()
        linkslist = tag.find("a", href=True)["href"]
        row = namelist + ";" + linkslist + "\n"
        f.write(row)
You have to indent the code after the 'with open' statement.
import requests
from bs4 import BeautifulSoup

url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")
print(jogos)

with open('Lista_de_Jogos.csv', 'a', newline='', encoding='UTF8') as f:
    for tag in jogos:
        namelist = tag.find("a", href=True).get_text().strip()
        linkslist = tag.find("a", href=True)["href"]
        row = namelist + ';' + linkslist + '\n'
        f.write(row)
I am iterating through multiple pages with the same URL except for the number at the end. Once it reaches a 404, however, the program freezes, even though I am catching the exception in a try block. Am I missing something here? Here is my code; the program hangs once it hits https://www.tenable.com/plugins/nessus/14587.
import bs4 as bs
from urllib.request import urlopen, Request
import urllib

ID = 14580
while ID < 132734:
    #ID == 14391
    ID == 14580
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
    req = Request(url=reg_url, headers=headers)
    try:
        source = urlopen(req).read()
    except urllib.error.HTTPError as e:
        if e.getcode() == 404:  # check the return code
            continue
        raise
    soup = bs.BeautifulSoup(source, 'lxml')
    print(ID)
    print(reg_url)
    print(soup.title.string)
    ID += 1
UPDATED WORKING CODE:
import bs4 as bs
from urllib.request import urlopen, Request
import urllib

ID = 14580
while ID < 132734:
    #ID == 14391
    ID == 14580
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
    req = Request(url=reg_url, headers=headers)
    try:
        source = urlopen(req).read()
    except urllib.error.HTTPError as e:
        if e.getcode() == 404:  # check the return code
            ID += 1
            continue
        raise
    soup = bs.BeautifulSoup(source, 'lxml')
    print(ID)
    print(reg_url)
    print(soup.title.string)
    ID += 1
I added another increment to ID inside the exception block, as seen in the updated code, and it works fine now.
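As a side note (not part of the original fix), a for loop over the ID range avoids having to bump ID by hand in both the normal path and the except branch; a minimal sketch using the same URL pattern:

import urllib.error
from urllib.request import urlopen, Request

import bs4 as bs

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

for ID in range(14580, 132734):
    reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
    req = Request(url=reg_url, headers=headers)
    try:
        source = urlopen(req).read()
    except urllib.error.HTTPError as e:
        if e.getcode() == 404:  # skip missing plugin pages; no manual increment needed
            continue
        raise
    soup = bs.BeautifulSoup(source, 'lxml')
    print(ID, reg_url, soup.title.string)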
I'm trying to detect the availability of an item on Amazon. Why doesn't this code work?
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc
import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from time import sleep
import time
from lxml import html
import json


def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    for i in range(20):
        sleep(3)
        doc = html.fromstring(page.content)
        XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
        RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
        AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
        return AVAILABILITY


file_name = raw_input("Enter file name: ")
filepath = "%s" % (file_name)

with open(filepath) as f:
    listoflinks = [line.rstrip('\n') for line in f]

all_links = []
for i in listoflinks:
    html = req.get(i)
    doc = SimplifiedDoc(html)
    amazon_links = doc.getElements('a')
    amazon_links = amazon_links.containsOr(['https://www.amazon.com/', 'https://amzn.to/'], attr='href')
    for a in amazon_links:
        if a.href not in all_links:
            all_links.append(a.href)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for i in all_links:
    print "LINK:"
    print i
    response = requests.get(i, headers=headers)
    #soup = BeautifulSoup(html, "lxml")
    soup = BeautifulSoup(response.content, features="lxml")
    title = soup.select("#productTitle")[0].get_text().strip()
    if check(i) == 'In stock.':
        price = soup.select("#priceblock_saleprice")[0].get_text()
    else:
        price = "UNAVAILABLE"
    review_count = int(soup.select("#acrCustomerReviewText")[0].get_text().split()[0])
    jsonObject = {'title': title, 'price': price, 'review_count': review_count}
    print json.dumps(jsonObject, indent=2)
    print "////////////////////////////////////////////////"

print "..............................................."
print "FINALLY..."
print "# OF LINKS RETRIEVED:"
print len(all_links)
When I execute it, this error appears:
File "scra.py", line 17, in check
doc = html.fromstring(page.content)
AttributeError: 'unicode' object has no attribute 'fromstring'
Please help me. I already tried converting page to pagedata = page.json() but it only made it worse.
Try using this instead of html.fromstring:
doc = BeautifulSoup(page.content, 'html.parser')
doc = doc.prettify()
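Following that suggestion, the whole check() function could look roughly like this (a sketch, assuming the availability text still lives in the div with id="availability" that the original XPath targeted; the exact wording Amazon returns may differ):

import requests
from bs4 import BeautifulSoup

def check(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    page = requests.get(url, headers=headers)
    doc = BeautifulSoup(page.content, 'html.parser')
    availability = doc.find('div', id='availability')  # same element the XPath pointed at
    return availability.get_text().strip() if availability else None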
I am getting the error "Global name 'Request' is not defined" with the following script:
#!/usr/bin/env python
import BeautifulSoup
import requests

link = ''


# sitekey retrieval
def get_sitekey():
    captcha_page = Request(link, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36'
                                                        '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36'})
    product_page = urlopen(captcha_page)
    soup = BeautifulSoup(product_page, 'html.parser')
    sitekey = soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
    print(sitekey)


if __name__ == '__main__':
    get_sitekey()
You need to access the Request object through the requests module:
#!/usr/bin/env python
import BeautifulSoup
import requests

link = ''


# sitekey retrieval
def get_sitekey():
    captcha_page = requests.Request(link, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36'
                                                                 '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36'})
    product_page = urlopen(captcha_page)
    soup = BeautifulSoup(product_page, 'html.parser')
    sitekey = soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
    print(sitekey)


if __name__ == '__main__':
    get_sitekey()
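Alternatively, since requests is already imported, the page can be fetched with requests.get and parsed directly, which avoids mixing a requests object with urlopen; a sketch, assuming the bs4 package is installed (the bare import BeautifulSoup form is the old Python 2 package):

#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup

link = ''  # target page URL goes here


def get_sitekey():
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/56.0.2924.28 Safari/537.36'}
    captcha_page = requests.get(link, headers=headers)
    soup = BeautifulSoup(captcha_page.text, 'html.parser')
    sitekey = soup.find('div', attrs={'class': 'g-recaptcha'})['data-sitekey']
    print(sitekey)


if __name__ == '__main__':
    get_sitekey()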