UnicodeEncodeError when typing Chinese into url - python

import urllib.request
import bs4
key_word = input('What is the good you are searching for?')
price_low_limit = input('What are the lowest price restrictions?')
price_high_limit = input('What are the highest price restrictions?')
url_jd = 'https://search.jd.com/search?keyword={}&enc=utf-8&qrst=2&rt=1&stop=1&vt=2&wq={}&ev=exprice_{}-{}%5E&uc=0#J_searchWrap'.format(key_word, key_word, price_low_limit, price_high_limit)
response = urllib.request.urlopen(url_jd)
text = response.read().decode()
html = bs4.BeautifulSoup(text, 'html.parser')
total_item_j = []
for information in html.find_all('div', {'class': "gl-i-wrap"}):
for a in information.find_all('a', limit=1):
a_title = a['title']
a_href = a['href']
for prices in information.find_all('i', limit=1):
a = prices.text
item_j = {}
item_j['price'] = float(a)
item_j['name'] = a_title
item_j['url'] = a_href
total_item_j.append(item_j)
print(total_item_j)
This is a project I do in the school. I want to use this program to extract prices of goods I search. Currently, this code can work for English Input in python 3.7. However, if I search the good in Chinese, for example '巧克力' (Chocolate), it would turn out a Unicode Encode Error. Please, help me out.

You just want to ensure your string is encoded correctly. If you change key_word to:
key_word = u'巧克力'.encode('utf-8')
You'll find it works fine.
So your code would look like:
import urllib.request
import bs4
key_word = input('What is the good you are searching for?')
key_word = key_word.encode('utf-8')
...
More on unicode in python here

If you look at the stack trace, you'll see something like:
# Non-ASCII characters should have been eliminated earlier
--> 983 self._output(request.encode('ascii'))
ASCII encoding will fail for the characters in your key_word variable.
They should be URL-escaped first. Do that with:
key_word = urllib.parse.quote_plus(key_word)
And then prepare the url_jd string.

Related

Python: Get element next to href

Python code:
url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract the name of the players which is 'Shareef Abdur-Rahim', but I want to do it for all the tags_initial lists,
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe that could help you :
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://www.basketball-reference.com/players/'
# Alphabet
initial = list(string.ascii_lowercase)
datas = []
# URLS
urls = [url + i for i in initial]
for url in urls:
# Soup Object
soup = BeautifulSoup(urlopen(url), 'html.parser')
# Players link
url_links = soup.findAll("a", href=re.compile("players"))
for link in url_links:
# Player name
datas.append(link.string)
print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.

Python: unnable to get any output using beautifulsoup

I am trying to scrape some words from any random website, but the following program is not showing errors and not showing any output when i tried printing the results.
I have checked the code twice and even incorporated an if statement to see whether the program is getting any words or not.
import requests
import operator
from bs4 import BeautifulSoup
def word_count(url):
wordlist = []
source_code = requests.get(url)
source = BeautifulSoup(source_code.text, features="html.parser")
for post_text in source.findAll('a', {'class':'txt'}):
word_string=post_text.string
if word_string is not None:
word = word_string.lower().split()
for each_word in word:
print(each_word)
wordlist.append(each_word)
else:
print("None")
word_count('https://mumbai.craigslist.org/')
I am expecting all the words under the "class= txt" to be displayed in the output.
OP: I am expecting all the words of the class text to be displayed in the output
The culprit:
for post_text in source.findAll('a', {'class':'txt'}):
The reason:
anchor tag has no class txt but the span tag inside it does.
Hence:
import requests
from bs4 import BeautifulSoup
def word_count(url):
source_code = requests.get(url)
source=BeautifulSoup(source_code.text, features="html.parser")
for post_text in source.findAll('a'):
s_text = post_text.find('span', class_ = "txt")
if s_text is not None:
print(s_text.text)
word_count('https://mumbai.craigslist.org/')
OUTPUT:
community
activities
artists
childcare
classes
events
general
groups
local news
lost+found
missed connections
musicians
pets
.
.
.
You are targeting the wrong elements.
if you use
print(source)
Everything works fine but the moment you try to target the element with findAll you are targeting something wrong because you get an empty list array.
If you replace
for post_text in source.findAll('a', {'class':'txt'}):
with
for post_text in source.find_all('a'):
everyting seems to work fine
I have visited https://mumbai.craigslist.org/, and find there is no <a class="txt">, only <span class="txt">, so I think you can try this:
def word_count(url):
wordlist = []
source_code = requests.get(url)
source=BeautifulSoup(source_code.text, features="html.parser")
for post_text in source.findAll('span', {'class':'txt'}):
word_string=post_text.text
if word_string is not None:
word = word_string.lower().split ()
for each_word in word:
print(each_word)
wordlist.append(each_word)
else:
print("None")
it will output correctly:
community
activities
artists
childcare
classes
events
general
...
Hope that helps you, and comment if you have further questions. : )

Improve Regex to catch complete emails from Google search?

In order to practice and help my sister get emails from doctors for her baby, I have designed this email harvester. It makes a search, cleans the urls given, adds them to a dictionary and parse them for emails in two different ways.
The code has been taken from different places, so if you correct me, please explain clearly your improvement, as I am working at the limit of my knowledge already.
The question is how to get emails better (and improve code, if possible). I'll post the code and the exact output below:
CODE of my program:
import requests, re, webbrowser, bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time, random, webbrowser
import urllib.request
def google_this(): #Googles and gets the first few links
search_terms = ['Fiat','Lambrusco']
added_terms = 'email contact? #'
#This searches for certain keywords in Google and parses results with BS
for el in search_terms:
webpage = 'http://google.com/search?q=' + str(el) + str(added_terms)
print('Searching for the terms...', el,added_terms)
headers = {'User-agent':'Mozilla/5.0'}
res = requests.get(webpage, headers=headers)
#res.raise_for_status()
statusCode = res.status_code
if statusCode == 200:
soup = bs4.BeautifulSoup(res.text,'lxml')
serp_res_rawlink = soup.select('.r a')
dicti = [] #This gets the href links
for link in serp_res_rawlink:
url = link.get('href')
if 'pdf' not in url:
dicti.append(url)
dicti_url = [] #This cleans the "url?q=" from link
for el in dicti:
if '/url?q=' in el:
result = (el.strip('/url?q='))
dicti_url.append(result)
#print(dicti_url)
dicti_pretty_links = [] #This cleans the gibberish at end of url
for el in dicti_url[0:4]:
pretty_url = el.partition('&')[0]
dicti_pretty_links.append(pretty_url)
print(dicti_pretty_links)
for el in dicti_pretty_links: #This converts page in BS soup
# browser = webdriver.Firefox()
# browser.get(el)
# print('I have been in the element below and closed the window')
# print(el)
# time.sleep(1)
# browser.close()
webpage = (el)
headers = {'User-agent':'Mozilla/5.0'}
res = requests.get(webpage, headers=headers)
#res.raise_for_status()
statusCode = res.status_code
if statusCode == 200:
soup = bs4.BeautifulSoup(res.text,'lxml')
#This is the first way to search for an email in soup
emailRegex = re.compile(r'([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+])', re.VERBOSE)
mo = emailRegex.findall(res.text)
#mo = emailRegex.findall(soup.prettify())
print('THIS BELOW IS REGEX')
print(mo)
#This is the second way to search for an email in soup:
mailtos = soup.select('a[href^=mailto]')
for el in mailtos:
print('THIS BELOW IS MAILTOS')
print(el.text)
time.sleep(random.uniform(0.5,1))
google_this()
And here is the OUTPUT when this very same code above. As you can see, some emails seem to be found, but at cut just after the "#" symbol:
C:\Users\SK\AppData\Local\Programs\Python\Python35-32\python.exe C:/Users/SK/PycharmProjects/untitled/another_temperase.py
Searching for the terms... Fiat email contact? #
['http://www.fcagroup.com/en-US/footer/Pages/contacts.aspx', 'http://www.fiat.co.uk/header-contacts', 'http://www.fiatusa.com/webselfservice/fiat/', 'https://twitter.com/nic_fincher81/status/672505531689394176']
THIS BELOW IS REGEX
['investor.relations#f', 'investor.relations#f', 'sustainability#f', 'sustainability#f', 'mediarelations#f', 'mediarelations#f']
THIS BELOW IS MAILTOS
investor.relations#fcagroup.com
THIS BELOW IS MAILTOS
sustainability#fcagroup.com
THIS BELOW IS MAILTOS
mediarelations#fcagroup.com
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
['nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y']
Searching for the terms... Lambrusco email contact? #
['http://www.labattagliola.it/%3Flang%3Den']
Process finished with exit code 0
I would recommend a more restrictive version that still catches all of the email:
([a-zA-Z0-9_.+]+#[a-zA-Z0-9_.+]+)
The problem of not catching anything after the first letter after the # is because the regex is missing a +
([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+]+)
Originally this part [a-zA-Z0-9_.+] simply said to catch one of any of the following characters a-z, A-Z, 0-9, ., _,+.
I would also be careful about #+ which says to catch 1 or more "#" symbols.
So a potentially valid email could look like this:
..................########################.................

BeautifulSoup returning unrelated HTML

I'm trying to parse basketball stat data from pages like http://www.sports-reference.com/cbb/boxscores/2014-11-14-kentucky.html. I'm using Python 2.7.6 and BeautifulSoup 4-4.3.2. I'm searching gamelogs like the above page for the class "sortable" in order to get access to the raw stat data contained within the tables. I am only interested in the "Basic Stats" for each team.
However, the HTML that BeautifulSoup is returning is not at all what I expect. Instead I get a list of all-time team records and data for every school that has ever played. I don't have enough reputation to post a second link here of the output or I would.
Basically, there are four class "sortable" tables on the boxscore page. When I ask BS to find them by the only way I can think of to distinguish them from the other data, it instead returns completely irrelevant data and I can't even figure out where the returned data comes from.
Here's the code:
import urllib2
import re
import sys
from bs4 import BeautifulSoup
class Gamelogs():
def __init__(self):
#the base bage that has all boxscore links
self.teamPageSoup = BeautifulSoup(urllib2.urlopen(
'http://www.sports-reference.com/cbb/schools/' + school +
'/2015-gamelogs.html'))
#use regex to only find links with score data
self.statusPageLinks = self.teamPageSoup.findAll(href=re.compile(
"boxscores"));
def scoredata(links, school):
#for each link in the school's season
for l in links:
gameSoup = BeautifulSoup(urllib2.urlopen(l))
#remove extra link formatting to get just filename alone
l = l[59+len(school):]
#open a local file with that filename to store the results
fo = open(str(l),"w")
#create a list that will hold the box score data only
output = gameSoup.findAll(class_="sortable")
#write it line by line to the file that was just opened
for o in output:
fo.write(str(o) + '\n')
fo.close
def getlinks(school):
gamelogs = Gamelogs()
#open a new file to store the output
fo = open(school + '.txt',"w")
#remove extraneous links
gamelogs.statusPageLinks = gamelogs.statusPageLinks[2:]
#create the list that will hold each school's seasonlong boxscores
boxlinks = list()
for s in gamelogs.statusPageLinks:
#make the list element a string so it can be sliced
string = str(s)
#remove extra link formatting
string = string[9:]
string = string[:-16]
#create the full list of games per school
boxlinks.insert(0, 'http://www.sports-reference.com/cbb/schools/'
+ school + string)
scoredata(boxlinks, school)
if __name__ == '__main__':
#for each school as a commandline argument
for arg in sys.argv[1:]:
school = arg
getlinks(school)
Is this a problem with BS, my code, or the site? T
It looks like this is an issue with your code. The page that you are getting back sounds like this one: http://www.sports-reference.com/cbb/schools/?redir
Whenever I enter an invalid school name I am redirected to a page showing stats for 477 different teams. FYI: team names in the url are also case sensitive.

Finding information on a website without an external module

I am creating a program in Python where you search up a tv show/movie, and from IMDb, it gives you:
The title, year, rating, age rating, and synopsis of the movie.
I want to use no external modules at all, only the ones that come with Python 3.4.
I know I will have to use urllib, but I do not know where to go from there.
How would I do this?
This is an example taken from here:
import json
from urllib.parse import quote
from urllib.request import urlopen
def search(title):
API_URL = "http://www.omdbapi.com/?r=json&s=%s"
title = title.encode("utf-8")
url = API_URL % quote(title)
data = urlopen(url).read().decode("utf-8")
data = json.loads(data)
if data.get("Response") == "False":
print(data.get("Error", "Unknown error"))
return data.get("Search", [])
Then you can do:
>>> search("Idiocracy")
[{'Year': '2006', 'imdbID': 'tt0387808', 'Title': 'Idiocracy'}]
It's maybe too complex but:
I look at the webpage code. I look where the info I want is and then I extract the info.
import urllib.request
def search(title):
html = urllib.request.urlopen("http://www.imdb.com/find?q="+title).read().decode("utf-8")
f=html.find("<td class=\"result_text\"> <a href=\"",0)+34
openlink=""
while html[f]!="\"":
openlink+= html[f]
f+=1
html = urllib.request.urlopen("http://www.imdb.com"+openlink).read().decode("utf-8")
f = html.find("<meta property='og:title' content=\"",0)+35
titleyear=""
while html[f] !="\"":
titleyear+=html[f]
f+=1
f = html.find("title=\"Users rated this ",0)+24
rating = ""
while html[f] !="/":
rating+= html[f]
f+=1
f=html.find("<meta name=\"description\" content=\"",0)+34
shortdescription = ""
while html[f] !="\"":
shortdescription+=html[f]
f+=1
print (titleyear,rating,shortdescription)
return (titleyear,rating,shortdescription)
search("friends")
The number adding to f has to be just right, you count the lenght of the string you are searching, because find() returns you the position of the first letter in the string.
It looks bad, is there any other simpler way to do it?

Categories

Resources