Help, what should I do? When I run a search, the result title is often displayed with an encoding error, for example (УкÑаинÑкий VOD поÑÑаÐ).
Code
from base64 import encode
import requests
from lxml.html import fromstring
from googlesearch import search
from time import sleep as wait
import os

# clear the console
os.system('cls || clear')

query = input('Уведіть ключові слова : ')
list_url = []

# ask for the number of results until a valid integer is entered
while 1:
    try:
        col = int(input('Количество запросов : '))
        break
    except ValueError:
        print('Введите число')

# collect the result URLs from Google
for j in search(query, tld="co.in", num=col, stop=col, pause=2):
    list_url.append(j)

if list_url != []:
    # fetch each result and print its <title>
    for i in list_url:
        wait(0.1)
        r = requests.get(i)
        tree = fromstring(r.content)
        Title = tree.findtext('.//title')
        print(f'\r[{Title}] - {i}\n')
    try:
        os.remove('.google-cookie')
    except FileNotFoundError:
        pass
else:
    print('Empty')

input('\nExit\n')
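If it helps, one common cause of this kind of mojibake is the response body being decoded with the wrong charset. Below is a hedged sketch of the fetch-and-parse part of the loop above, letting requests use its detected encoding before parsing; this is an assumption about the cause, not a confirmed fix:

        r = requests.get(i)
        # When the Content-Type header omits or misstates the charset, requests
        # falls back to ISO-8859-1; apparent_encoding uses character detection
        # instead (assumed to be what garbles the Cyrillic titles here).
        r.encoding = r.apparent_encoding
        tree = fromstring(r.text)
        Title = tree.findtext('.//title')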
I'm having a problem with this code. I have an HTTP request where I need to get some information from an API. I'm having trouble with the urllib request; I don't think I'm using the right library, because the code I based this on uses urllib2, but since Python 3 is different I don't know which one to use.
import urllib.request
import json
import os
from pprint import pprint

REST_URL = "http://data.bioontology.org"
API_KEY = "MYAPIKEY"

def get_json(url):
    opener = urllib.request.urlopen(url)
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())
    print(url)

# Get list of search terms
path = os.path.join(os.path.dirname("__file__"), 'classes_search_terms.txt')
terms_file = open(path, "r")
terms = []
for line in terms_file:
    terms.append(line)

# Do a search for every term
search_results = []
for term in terms:
    search_results.append(get_json(REST_URL + "/search?q=" + term)["collection"])

# Print the results
for result in search_results:
    pprint(result)
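For comparison, the urllib2 pattern this appears to be ported from builds an opener and attaches the Authorization header to it before opening the URL. A minimal Python 3 sketch of that approach, reusing the question's REST_URL and API_KEY, might look like this (a sketch, not a tested client):

import json
import urllib.request

REST_URL = "http://data.bioontology.org"
API_KEY = "MYAPIKEY"  # placeholder, as in the question

def get_json(url):
    # build_opener() returns an opener whose addheaders are sent with every
    # request, which is what the old urllib2 code relied on
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    with opener.open(url) as response:
        return json.loads(response.read().decode('utf-8'))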
This is the code that collects restaurant reviews.
The page range is hard-coded, but each store has a different number of reviews, so a store with only a few reviews should move on to the next store sooner.
The range I'm using is far too large, yet I can't shrink it, because some shops really do have reviews that far in.
How can I make this work efficiently?
I came across find_all(element) while searching about this, but I don't know whether I've applied it to my code incorrectly.
#python3
import sys
from bs4 import BeautifulSoup
import urllib.request
import requests
from urllib.parse import quote
import time
import os
import xlwt
import random
import re

FISRT_URL = "https://www.yelp.com/search?find_desc=Korean+Food&find_loc=Seattle,+WA&start="
LAST_URL = "&cflt=korean"

def get_link(URL, doc_name):
    global jisu_i
    global num
    global page
    # paginate the search results
    for jisu_i in range(1):
        current_page_num = 20 + jisu_i*10
        position = URL.index('t=')
        URL_with_page_num = URL[: position+2] + str(current_page_num) \
                            + URL[position+2 :]
        print(URL_with_page_num)
        importurl = URL_with_page_num
        r = requests.get(importurl)
        soup = BeautifulSoup(r.content.decode('euc-kr', 'replace'), "lxml")
        time.sleep(random.randint(10, 15))
        # follow the link of each business on the results page
        for title in soup.find_all('h3')[page+2:21]:
            page = page + 1
            title_link = title.select('a')
            for jisu_m in range(130):
                print(page)
                last_URL = title_link[0]['href']
                print(last_URL)
                first_URL = "https://www.yelp.com"
                global article_URL
                article_URL = first_URL + last_URL
                time.sleep(random.randint(15, 30))
                jisuurl = article_URL
                # paginate the reviews of this business over a fixed range
                for k in range(99):
                    jisu_page_num = 0 + k * 20
                    position = jisuurl.index('?')
                    URL_with_page_num = jisuurl[: position + 1] + str("start=") + str(jisu_page_num)
                    jisu_with_page_num = URL_with_page_num
                    print(jisu_with_page_num)
                    jisu_importurl = jisu_with_page_num
                    get_text(URL, jisu_importurl, doc_name)
                    time.sleep(random.randint(40, 180))
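For reference, the find_all idea mentioned in the question can be used to stop paginating as soon as a store runs out of reviews, instead of looping over a fixed range(99). A rough sketch under the assumption that review blocks can be selected reliably (the selector below is a guess, and Yelp's markup changes often):

def crawl_reviews(article_URL):
    start = 0
    while True:
        page_url = article_URL + "?start=" + str(start)
        r = requests.get(page_url)
        soup = BeautifulSoup(r.content, "lxml")
        # assumed selector for one review block; adjust to the real markup
        reviews = soup.find_all("div", class_="review")
        if not reviews:
            break  # no reviews on this page -> move on to the next store
        for review in reviews:
            print(review.get_text(" ", strip=True))
        start += 20
        time.sleep(random.randint(10, 15))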
Yelp has a very well-documented API here: https://www.yelp.com/developers/documentation/v3
This is the only reliable way of interacting with the site programmatically.
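As a quick illustration of that route, here is a minimal sketch of calling the business search endpoint of the Yelp Fusion API with requests; the API key is a placeholder you would obtain from the developer portal, and the parameters simply mirror the question's search:

import requests

API_KEY = "YOUR_YELP_API_KEY"  # placeholder

response = requests.get(
    "https://api.yelp.com/v3/businesses/search",
    headers={"Authorization": "Bearer " + API_KEY},
    params={"term": "Korean Food", "location": "Seattle, WA"},
)
for business in response.json().get("businesses", []):
    print(business["name"], business["rating"])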
I would like to estimate the impact of news on Dow Jones quotes. For this, I wrote a Python HTML parser using the BeautifulSoup library. I extract an article and store it in an XML file for further analysis with the NLTK library. How can I increase the parsing speed? The code below does the required task, but very slowly.
Here is the code of the HTML parser:
import urllib2
import re
import xml.etree.cElementTree as ET
import nltk
from bs4 import BeautifulSoup
from datetime import date
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords
from collections import defaultdict

def main_parser():
    # starting date
    a = date(2014, 3, 27)
    # ending date
    b = date(2014, 3, 27)
    articles = ET.Element("articles")
    f = open('~/Documents/test.xml', 'w')
    # loop through the links and, for each link, extract the article text and store it in the XML file
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"
        page = urllib2.urlopen(url)
        # use html5lib ??? possibility to use another parser
        soup = BeautifulSoup(page.read(), "html5lib")
        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for links in soup.find_all("div", "headlineMed"):
            anchor_tag = links.a
            if not 'video' in anchor_tag['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(links.text[-11:])
                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(anchor_tag.text)
                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(anchor_tag['href']).encode('utf-8')
                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        # get the text and remove all stop words
                        article_text.text = str(remove_stop_words(extract_article(anchor_tag['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass
    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")

# get the article text from the specific url
def extract_article(url):
    plain_text = ""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    tag = soup.find_all("p")
    # strip the remaining html tags
    plain_text = re.sub(r'<p>|</p>|[|]|<span class=.*</span>|<a href=.*</a>', "", str(tag))
    plain_text = plain_text.replace(", ,", "")
    return str(plain_text)

def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Several fixes can be applied (without changing the modules you are currently using):
use the lxml parser instead of html5lib - it is much, much (and 3 more muches) faster
parse only part of the document with SoupStrainer (note that html5lib doesn't support SoupStrainer - it will always parse the whole document, slowly)
Here's how the code would look after the changes. A brief performance test shows at least a 3x improvement:
import urllib2
import xml.etree.cElementTree as ET
from datetime import date

from bs4 import SoupStrainer, BeautifulSoup
import nltk
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords

def main_parser():
    a = b = date(2014, 3, 27)
    articles = ET.Element("articles")
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime(
            "%d") + ".html"

        links = SoupStrainer("div", "headlineMed")
        soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=links)

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for link in soup.find_all('a'):
            if not 'video' in link['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(link.text[-11:])
                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(link.text)
                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(link['href']).encode('utf-8')
                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        article_text.text = str(remove_stop_words(extract_article(link['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass
    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")

def extract_article(url):
    paragraphs = SoupStrainer('p')
    soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=paragraphs)
    return soup.text

def remove_stop_words(text):
    text = nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)
Note that I've removed the regular expression processing from extract_article() - it looks like you can just get the whole text from the p tags.
I might have introduced some problems - please check if everything is correct.
Another solution would be to use lxml for everything, from parsing (replacing BeautifulSoup) to creating the XML (replacing xml.etree.cElementTree); a rough sketch of that route is shown below.
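Here is a minimal sketch of that lxml-only approach, assuming the same Reuters archive pages; the function name and XML layout are illustrative, not the code above:

import urllib2
from lxml import etree, html

def parse_day(url):
    # parse the archive page with lxml.html and build the output with lxml.etree
    doc = html.parse(urllib2.urlopen(url))
    root = etree.Element("articles")
    for anchor in doc.xpath('//div[@class="headlineMed"]/a'):
        href = anchor.get('href')
        if 'video' in href:
            continue
        item = etree.SubElement(root, "article")
        etree.SubElement(item, "article_name").text = anchor.text_content()
        etree.SubElement(item, "article_link").text = href
    return etree.tostring(root, pretty_print=True, encoding="utf-8")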
Another solution (definitely the fastest) would be to switch to the Scrapy web-scraping framework.
It is simple and very fast, and every kind of battery you can imagine is included: link extractors, XML exporters, database pipelines, etc. Worth looking into; a minimal spider sketch follows.
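For a taste of what that looks like, here is a minimal spider sketch for the same archive page; the spider name, item fields, and CSS selector are assumptions, not tested against the live site:

import scrapy

class ReutersArchiveSpider(scrapy.Spider):
    name = "reuters_archive"
    start_urls = ["http://www.reuters.com/resources/archive/us/20140327.html"]

    def parse(self, response):
        # yield one item per non-video headline link
        for anchor in response.css('div.headlineMed a'):
            href = anchor.attrib['href']
            if 'video' not in href:
                yield {
                    'article_name': anchor.css('::text').get(),
                    'article_link': href,
                }

Running it with scrapy runspider spider.py -o articles.xml uses the built-in XML exporter mentioned above.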
Hope that helps.
You want to pick the best parser.
We benchmarked most of the parsers / platforms while building http://serpapi.com
Here is a full article on Medium:
https://medium.com/#vikoky/fastest-html-parser-available-now-f677a68b81dd
This link lets me get a random item from the database. However, I would like to retrieve items automatically using Python. Here's my code:
import sys
from urllib.parse import urlencode
from urllib.request import urlopen
# parameters
data = {}
data["query"] = "reviewd:yes+AND+organism:9606"
data["random"] = "yes"
url_values = urlencode(data)
url = "http://www.uniprot.org/uniprot/"
full_url = url + '?' + url_values
data = urlopen(full_url)
out = open("1.html", 'w')
out.write(str(data.read()))
However, I cannot get the desired page. Does anyone know what's wrong with my code? I'm using Python 3.x.
You have several issues:
reviewd is misspelled; it should be reviewed
The base URL needs to have /uniprot/ at the end
You need to use spaces instead of + in your query string
Here is what that would look like:
import sys
from urllib.parse import urlencode
from urllib.request import urlopen
# parameters
data = {}
data["query"] = "reviewed:yes AND organism:9606"
data["random"] = "yes"
url_values = urlencode(data)
url = "http://www.uniprot.org/uniprot/"
full_url = url + '?' + url_values
data = urlopen(full_url)
out = open("1.html", 'w')
out.write(str(data.read()))
This produces the following URL:
http://www.uniprot.org/uniprot/?query=reviewed%3Ayes+AND+organism%3A9606&random=yes
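As a quick sanity check, urlencode quotes the spaces and colons for you, which is where the + and %3A in that URL come from:

from urllib.parse import urlencode

print(urlencode({"query": "reviewed:yes AND organism:9606", "random": "yes"}))
# query=reviewed%3Ayes+AND+organism%3A9606&random=yes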