Help what to do. When you try to search, it often displays a name with an encoding error in the squeak. That is, for example (УкÑаинÑкий VOD поÑÑаÐ)
Code
from base64 import encode
import requests
from lxml.html import fromstring
from googlesearch import search
from time import sleep as wait
import os
os.system('cls || clear')
query = input('Уведіть ключові слова : ')
list_url = []
while 1:
try:
col = int(input('Количество запросов : '))
break
except ValueError:
print('Введите число')
for j in search(query, tld="co.in", num=col, stop=col, pause=2):
list_url.append(j)
if list_url != []:
for i in list_url:
wait(0.1)
r = requests.get(i)
tree = fromstring(r.content)
Title = tree.findtext('.//title')
print(f'\r[{Title}] - {i}\n')
try:
os.remove('.google-cookie')
except FileNotFoundError:
pass
else:
print('Empty')
input('\nExit\n')
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
Related
I made a python function to get all of the categories and their child node until the last one. I want the output to be like this: {'https://www.amazon.ae/gp/bestsellers/appliances/': ['Heating And Cooling', 'https://www.amazon.ae/gp/bestsellers/appliances/12134072031']['Air Conditioners', 'https://www.amazon.ae/gp/bestsellers/kitchen/15298093031']['Cabinet Air Conditioners', 'https://www.amazon.ae/gp/bestsellers/kitchen/15298093031']
My code:
import requests
from bs4 import BeautifulSoup as bs
import time
from tqdm import tqdm
_seen_categories = []
def crawl(url):
r = requests.get(url)
time.sleep(2)
s = bs(r.text, "html.parser")
try:
treeitems = s.find("span", class_="_p13n-zg-nav-tree-all_style_zg-selected__1SfhQ").find_next("div", {"role": "group"}).find_all("div", {"role": "treeitem"})
except:
treetiems = None
fullDict = []
for treeitem in tqdm(treeitems):
a = treeitem.find_next("a")
d = {url:[a.text.strip(), a["href"]]}
fullDict.append(d)
print(a.text.strip())
print(a["href"])
if treeitems is not None:
next_url = "https://www.amazon.ae"+a['href']
try:
if next_url not in _seen_categories:
crawl(next_url)
except:
pass
else:
_seen_categories.append(next_url)
time.sleep(2)
crawl("https://www.amazon.ae/gp/bestsellers/appliances")
This function is not formatting as expected. Need help to complete this.
Python code get stuck in the try block
`
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2
def url1_to_string(url1):
html=""
proxyDict = {
'http': 'http://username:pwd#proxyurl:8080',
'https': 'https://username:pwd#proxyurl:8080'
}
try:
print('Before res in try')
res = requests.get(url1,proxies=proxyDict)
print('After res in try')
except:
pass
html = res.text
soup = BeautifulSoup(html, 'html5lib')
for script in soup(["script", "style", 'aside']):
script.extract()
return " ".join(re.split(r'[\n\t]+', soup.get_text()))
df=pd.read_csv(r'C:\filepath\abc.csv',encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count=[]
agg_url_count=[]
for index, row in df.iterrows():
agg_url_list = []
ini_url="http://www.google.com/search?q="+row['ANCHOR_NAME']+" AND "+row['ACCOUNT_NAME']
r = requests.get(ini_url,proxies={"http":"http://one.proxy.att.com:8080"})
ny_bb1 = url1_to_string(ini_url)
anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
print(anchor_count)
soup = BeautifulSoup(r.text,"html.parser")
get_details1 = soup.find_all("div", attrs={"class": "g"})
sublist1 = []
for details1 in get_details1:
link1 = details1.find_all("h3")
for mdetails1 in link1[:]:
links1 = mdetails1.find_all("a")
lmk1 = ""
for lnk1 in links1[:]:
lmk1 = lnk1.get("href")[7:].split("&")
sublist1.append(lmk1[0])
aggregate_count1=0
for x1 in sublist1[:3]:
anchorcount1=0
accountcount1=0
print("aagg url",x1)
try:
print('In try block')
ny_bb1 = url1_to_string(x1)
except KeyboardInterrupt: print('You cancelled the operation.')
finally:
pass
ny_bb1=ny_bb1.upper()
print(ny_bb1)
row['ANCHOR_NAME']=row['ANCHOR_NAME'].upper()
row['ACCOUNT_NAME']=row['ACCOUNT_NAME'].upper()
anchor_name=re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
account_name=re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
if(anchor_name==account_name):
if(row['ANCHOR_NAME'] in ny_bb1.upper()):
anchorcount1 = anchorcount1 + 1
if(row['ACCOUNT_NAME'] in ny_bb1.upper()):
accountcount1 = accountcount1 + 1
else:
if (anchor_name in ny_bb1.upper()):
anchorcount1 = anchorcount1 + 1
if(account_name in ny_bb1.upper()):
accountcount1 = accountcount1 + 1
if(anchorcount1 > 0 and accountcount1 > 0):
aggregate_count1=aggregate_count1+1
agg_url_list.append(x1[:])
print("existance of both",aggregate_count1)
aggregate_page_count.append(aggregate_count1)
agg_url_count.append(agg_url_list)
df['anc_cnt']=pd.Series(anchor_count)
df['acc_cnt']=pd.Series(account_count)
df['agg_cnt']=pd.Series(aggregate_page_count)
df['agg_url_list']=pd.Series(agg_url_count)
`
The contents of the abc.csv file as follows ::
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URL's the code gets stuck in the try block and control does not come to except block where I want to ignore the exception and continue with normal program flow, as executing the next URL's and so on.
try:
import sys
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
import sys
from urllib2 import urlopen
def fetch_words(url):
html = urlopen(url)
print(html.read())
story_words = []
for line in html:
line_words = line.decode('utf-8').split()
for word in line_words:
story_words.append(word)
return story_words
def print_items(items):
for item in items:
print(item)
def main():
url = sys.argv[1]
words = fetch_words(url)
print_items(words)
if name == '__main__'
main()
You might have other errors, but surely, name is not declared, so
if name == 'main'
will raise you an error. Instead, it should be
if __name__ == '__main__'
I am trying to get the user statuses from Weibo, but I keep having this error.
import re
import string
import sys
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
import requests
from lxml import etree
reload(sys)
sys.setdefaultencoding('utf-8')
if(len(sys.argv)>=2):
user_id = (int)(sys.argv[1])
else:
user_id = (int)(raw_input("input user_id: "))
cookie = {"Cookie": "******my cookies"}
url = 'http://weibo.cn/u/%d?filter=1&page=1'%user_id
html = requests.get(url, cookies = cookie).content
selector = etree.HTML(html)
pageNum = (int)(selector.xpath('//input[#name="mp"]')[0].attrib['value'])
result = ""
urllist_set = set()
word_count = 1
image_count = 1
print 'spider is ready...'
for page in range(1,pageNum+1):
url = 'http://weibo.cn/u/%d?filter=1&page=%d'%(user_id,page)
lxml = requests.get(url, cookies = cookie).content
selector = etree.HTML(lxml)
content = selector.xpath('//span[#class="ctt"]')
for each in content:
text = each.xpath('string(.)')
if word_count>=4:
text = "%d :"%(word_count-3) +text+"\n\n"
else :
text = text+"\n\n"
result = result + text
word_count += 1
fo = open("/Users/apple/Desktop/%s"%user_id, "wb")
fo.write(result)
word_path=os.getcwd()+'/%d'%user_id
print 'done'
Error:
File "weibo_spider.py", line 25, in <module>
pageNum = (int)(selector.xpath('//input[#name="mp"]')[0].attrib['value'])
IndexError: list index out of range
You are assuming selector.path will always find something, but this isn't true most of cases. So build habit of defensive programming. See Defensive Programming
Try replacing
pageNum = (int)(selector.xpath('//input[#name="mp"]')[0].attrib['value'])
With:
controls = selector.xpath('//input[#name="mp"]')
if controls:
pageNum = int(controls[0].attrib['value'])
I've been reading tens of examples for similar issues, but I can't get any of the solutions I've seen or their variants to run. I'm screen scraping, and I just want to ignore 404 errors (skip the pages). I get
'AttributeError: 'module' object has no attribute 'HTTPError'.
I've tried 'URLError' as well. I've seen the near identical syntax accepted as working answers. Any ideas? Here's what I've got:
import urllib
import datetime
from bs4 import BeautifulSoup
class EarningsAnnouncement:
def __init__(self, Company, Ticker, EPSEst, AnnouncementDate, AnnouncementTime):
self.Company = Company
self.Ticker = Ticker
self.EPSEst = EPSEst
self.AnnouncementDate = AnnouncementDate
self.AnnouncementTime = AnnouncementTime
webBaseStr = 'http://biz.yahoo.com/research/earncal/'
earningsAnnouncements = []
dayVar = datetime.date.today()
for dte in range(1, 30):
currDay = str(dayVar.day)
currMonth = str(dayVar.month)
currYear = str(dayVar.year)
if (len(currDay)==1): currDay = '0' + currDay
if (len(currMonth)==1): currMonth = '0' + currMonth
dateStr = currYear + currMonth + currDay
webString = webBaseStr + dateStr + '.html'
try:
#with urllib.request.urlopen(webString) as url: page = url.read()
page = urllib.request.urlopen(webString).read()
soup = BeautifulSoup(page)
tbls = soup.findAll('table')
tbl6= tbls[6]
rows = tbl6.findAll('tr')
rows = rows[2:len(rows)-1]
for earn in rows:
earningsAnnouncements.append(EarningsAnnouncement(earn.contents[0], earn.contents[1],
earn.contents[3], dateStr, earn.contents[3]))
except urllib.HTTPError as err:
if err.code == 404:
continue
else:
raise
dayVar += datetime.timedelta(days=1)
It looks like for urllib (not urllib2) that the exception is urllib.error.HTTPError, not urllib.HTTPError. See the documentation for more information.
Do this :
import urllib.error# import
except urllib.error.URLError as e:# use 'urllib.error.URLError' and not 'urllib.HTTPError'
print ('Error code: ', e.code)# or what ever u want
return e.code