Python crawler for IEEE paper keywords

I am trying to use a crawler to get IEEE paper keywords, but now I get an error.
How can I fix my crawler?
My code is here:
import re
import requests
import json
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')
for i in tag[9]:
    s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace("'", '"').replace(";", ''))
and the error is here:
Traceback (most recent call last):
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 90, in <module>
a.get_es_data(offset=0, size=1)
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 53, in get_es_data
self.get_data(link=ieee_link, esid=es_id)
File "G:/github/爬蟲/redigg-leancloud/crawlers/sup_ieee_keywords.py", line 65, in get_data
s = json.loads(re.findall('global.document.metadata=(.*;)', i)[0].replace(";", '').replace("'", '"'))
IndexError: list index out of range

Here's another answer. I don't know what you are doing with 's' in your code after the load (replace in my code).
The code below doesn't throw an error, but again, how are you using 's'?
import re
import requests
import json
from bs4 import BeautifulSoup

ieee_content = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(ieee_content.text, 'xml')
tag = soup.find_all('script')
# i is a list
for i in tag[9]:
    metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
    metadata = re.findall(metadata_format, i)
    if len(metadata) != 0:
        # convert the list
        convert_to_json = json.dumps(metadata)
        x = json.loads(convert_to_json)
        s = x[0].replace("'", '"').replace(";", '')
        ###########################################
        # I don't know what you plan to do with 's'
        ###########################################
        print(s)

Apparently, on line 65, some of the data provided in i did not suit the regex pattern you're trying to use. Therefore your [0] will not work, because the list returned by re.findall is empty.
Solution:
x = re.findall('global.document.metadata=(.*;)', i)
if x:
    s = json.loads(x[0].replace("'", '"').replace(";", ''))
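
For what it's worth, a slightly more defensive version of the whole loop might look like the sketch below. It assumes, as the question does, that one of the <script> tags contains a global.document.metadata=...; assignment and that the parsed dictionary has a keywords entry; both are assumptions about IEEE's page layout that can change at any time.

import json
import re

import requests
from bs4 import BeautifulSoup

resp = requests.get("http://ieeexplore.ieee.org/document/8465981", timeout=180)
soup = BeautifulSoup(resp.text, 'html.parser')

metadata = None
for script in soup.find_all('script'):
    text = script.string or ''  # only inline scripts can carry the metadata assignment
    match = re.search(r'global\.document\.metadata\s*=\s*(\{.*\});', text, re.DOTALL)
    if match:
        try:
            metadata = json.loads(match.group(1))
        except ValueError:
            continue  # not strict JSON; extra cleanup (as in the question) may be needed
        break

if metadata is not None:
    print(metadata.get('keywords'))  # 'keywords' is assumed; inspect metadata if the key differs
else:
    print('metadata block not found; the page layout may have changed')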

Traceback from "from docx import opendocx, getdocumenttext": ModuleNotFoundError: No module named 'exceptions'

Traceback (most recent call last):
File "C:\xampp\htdocs\Plag\scripts\main.py", line 8, in
from extractdocx import *
File "C:\xampp\htdocs\Plag\scripts\extractdocx.py", line 18, in
from docx import opendocx, getdocumenttext
File "C:\Users\zeesh\AppData\Local\Programs\Python\Python39\lib\site-packages\docx.py", line 30, in
from exceptions import PendingDeprecationWarning
ModuleNotFoundError: No module named 'exceptions'
# -*- coding: utf-8 -*-
# Master script for the plagiarism-checker
# Coded by: Shashank S Rao

#import other modules
from cosineSim import *
from htmlstrip import *
from extractdocx import *

#import required modules
import codecs
import traceback
import sys
import operator
import urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse
import json as simplejson

# Given a text string, remove all non-alphanumeric
# characters (using Unicode definition of alphanumeric).
def getQueries(text,n):
    import re
    sentenceEnders = re.compile('[.!?]')
    sentenceList = sentenceEnders.split(text)
    sentencesplits = []
    for sentence in sentenceList:
        x = re.compile(r'\W+', re.UNICODE).split(sentence)
        x = [ele for ele in x if ele != '']
        sentencesplits.append(x)
    finalq = []
    for sentence in sentencesplits:
        l = len(sentence)
        l = l/n
        index = 0
        for i in range(0,l):
            finalq.append(sentence[index:index+n])
            index = index + n-1
        if index != len(sentence):
            finalq.append(sentence[len(sentence)-index:len(sentence)])
    return finalq

# Search the web for the plagiarised text
# Calculate the cosineSimilarity of the given query vs matched content on google
# This is returned as 2 dictionaries
def searchWeb(text,output,c):
    try:
        text = text.encode('utf-8')
    except:
        text = text
    query = urllib.parse.quote_plus(text)
    if len(query) > 60:
        return output,c
    #using googleapis for searching web
    base_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
    url = base_url + '%22' + query + '%22'
    request = urllib.request.Request(url,None,{'Referer':'Google Chrome'})
    response = urllib.request.urlopen(request)
    results = simplejson.load(response)
    try:
        if ( len(results) and 'responseData' in results and 'results' in results['responseData'] and results['responseData']['results'] != []):
            for ele in results['responseData']['results']:
                Match = results['responseData']['results'][0]
                content = Match['content']
                if Match['url'] in output:
                    #print text
                    #print strip_tags(content)
                    output[Match['url']] = output[Match['url']] + 1
                    c[Match['url']] = (c[Match['url']]*(output[Match['url']] - 1) + cosineSim(text,strip_tags(content)))/(output[Match['url']])
                else:
                    output[Match['url']] = 1
                    c[Match['url']] = cosineSim(text,strip_tags(content))
    except:
        return output,c
    return output,c

# Use the main function to scrutinize a file for
# plagiarism
def main():
    # n-grams N VALUE SET HERE
    n = 9
    if len(sys.argv) < 3:
        print ("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    if sys.argv[1].endswith(".docx"):
        t = docxExtract(sys.argv[1])
    else:
        t = open(sys.argv[1],'r')
    if not t:
        print ("Invalid Filename")
        print ("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    t = t.read()
    queries = getQueries(t,n)
    q = [' '.join(d) for d in queries]
    found = []
    #using 2 dictionaries: c and output
    #output is used to store the url as key and number of occurences of that url in different searches as value
    #c is used to store url as key and sum of all the cosine similarities of all matches as value
    output = {}
    c = {}
    i = 1
    count = len(q)
    if count > 100:
        count = 100
    for s in q[:100]:
        output,c = searchWeb(s,output,c)
        msg = "\r"+str(i)+"/"+str(count)+"completed..."
        sys.stdout.write(msg)
        sys.stdout.flush()
        i = i+1
    #print ("\n")
    f = open(sys.argv[2],"w")
    for ele in sorted(iter(c.items()),key=operator.itemgetter(1),reverse=True):
        f.write(str(ele[0])+" "+str(ele[1]*100.00))
        f.write("\n")
    f.close()
    print ("\nDone!")

if __name__ == "__main__":
    try:
        main()
    except:
        #writing the error to stdout for better error detection
        error = traceback.format_exc()
        print(("\nUh Oh!\n"+"Plagiarism-Checker encountered an error!:\n"+error))
docx's last release was in 2014. The code imports the module exceptions, which was a top-level module in Python 2.7 but was removed in Python 3:
$ python2.7 -c "import exceptions"
$ python3.7 -c "import exceptions"
Traceback (most recent call last):
File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'exceptions'
The bottom line: the package is only for Python 2. Use Python 2.7 or find a different package.
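If you need to stay on Python 3, one alternative (my suggestion, not something the question mentions) is the actively maintained python-docx package; it also installs under the import name docx but exposes a different API than opendocx/getdocumenttext:

# pip install python-docx   (replaces the abandoned "docx" package)
from docx import Document

def docx_to_text(path):
    """Return the text of every paragraph in a .docx file, one paragraph per line."""
    document = Document(path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

print(docx_to_text('example.docx'))  # 'example.docx' is a placeholder path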

BeautifulSoup4 and Requests Module 'IndexError: list index out of range'

I'm new to web scraping with Python and am having a problem with the weather web scraping script I wrote. Here is the whole code, 'weather.py':
#! python3
import bs4, requests
weatherSite = requests.get('https://weather.com/en-CA/weather/today/l/eef019cb4dca2160f08eb9714e30f28e05e624bbae351ccb6a855dbc7f14f017')
weatherSoup = bs4.BeautifulSoup(weatherSite.text, 'html.parser')
weatherLoc = weatherSoup.select('.CurrentConditions--location--kyTeL')
weatherTime = weatherSoup.select('.CurrentConditions--timestamp--23dfw')
weatherTemp = weatherSoup.select('.CurrentConditions--tempValue--3a50n')
weatherCondition = weatherSoup.select('.CurrentConditions--phraseValue--2Z18W')
weatherDet = weatherSoup.select('.CurrentConditions--precipValue--3nxCj > span:nth-child(1)')
location = weatherLoc[0].text
time = weatherTime[0].text
temp = weatherTemp[0].text
condition = weatherCondition[0].text
det = weatherDet[0].text
print(location)
print(time)
print(temp + 'C')
print(condition)
print(det)
It basically parses the weather information from 'The Weather Channel' and prints it out. This code was working fine yesterday when I wrote it, but when I ran it today it gave me the following error:
Traceback (most recent call last):
File "C:\Users\username\filesAndStuff\weather.py", line 16, in <module>
location = weatherLoc[0].text
IndexError: list index out of range
Replace:
weatherLoc = weatherSoup.select('.CurrentConditions--location--kyTeL')
# print(weatherLoc)
# []
By:
weatherLoc = weatherSoup.select('h1[class*="CurrentConditions--location--"]')
# print(weatherLoc)
# [<h1 class="CurrentConditions--location--2_osB">Hamilton, Ontario Weather</h1>]
As you can see, your suffix kyTeL is not the same for me (2_osB). You need a partial match on the class attribute with class*= (note the *).
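
Applied to the rest of the script, the same partial-match idea would look roughly like this sketch; the class-name prefixes are taken from the question and may themselves change whenever weather.com ships a new build:

#! python3
import bs4, requests

weatherSite = requests.get('https://weather.com/en-CA/weather/today/l/eef019cb4dca2160f08eb9714e30f28e05e624bbae351ccb6a855dbc7f14f017')
weatherSoup = bs4.BeautifulSoup(weatherSite.text, 'html.parser')

# Match on the stable part of each class name; the generated suffix changes between site builds
weatherLoc = weatherSoup.select('h1[class*="CurrentConditions--location--"]')
weatherTime = weatherSoup.select('[class*="CurrentConditions--timestamp--"]')
weatherTemp = weatherSoup.select('[class*="CurrentConditions--tempValue--"]')
weatherCondition = weatherSoup.select('[class*="CurrentConditions--phraseValue--"]')
weatherDet = weatherSoup.select('[class*="CurrentConditions--precipValue--"] > span:nth-child(1)')

# Guard against empty results instead of indexing blindly
for name, matches in [('location', weatherLoc), ('time', weatherTime), ('temp', weatherTemp),
                      ('condition', weatherCondition), ('precipitation', weatherDet)]:
    print(name + ':', matches[0].text if matches else 'not found')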

Multiprocessing with text scraping

I want to scrape <p> from pages, and since there will be a couple of thousand of them I want to use multiprocessing. However, it doesn't work when I try to append the result to some variable.
I want to append the result of the scraping to data = [].
I made url_common for a base website, since some pages don't start with HTTP etc.
from tqdm import tqdm
import faster_than_requests as requests  # 20% faster on average in my case than urllib.request
import bs4 as bs

def scrape(link, data):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            data.append(p.text)
The above doesn't work, since map() doesn't accept a function like the one above.
I tried to use it another way:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            print(p.text)

from multiprocessing import Pool
p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error while using the above function:
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
self.run()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
task = get()
File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured out a way to do it so that it uses Pool and at the same time appends the result of the scraping to the given variable.
EDIT
I changed it a little bit to see where it stops:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.investing.com/'
        else:
            url_common = ''
        try:  # tries are always helpful with urls as you never know
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        print('works1')
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        print('works2')
        for p in paragraphs:
            print(p.text)

links = ['link', 'other_link', 'another_link']
scrape(links)
# WORKS PROPERLY AND PRINTS EVERYTHING

if __name__ == '__main__':
    p = Pool(5)
    print(p.map(scrape, links))
    # DOESN'T WORK, NOTHING PRINTS. Error like above
You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []

def mapping(x):  # <-- note that the mapping accepts a single value
    return x**2

for item in to_be_mapped:
    res = mapping(item)
    mapped.append(res)
So, to solve your problem, remove the outermost for-loop, since the iteration is handled by the map function:
def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        pass
    parsed = bs.BeautifulSoup(ht, 'lxml')
    paragraphs = parsed.find_all('p')
    for p in paragraphs:
        print(p.text)
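
If you also want to collect the paragraphs instead of printing them, have the worker return a list and let Pool.map gather one list per link. The sketch below substitutes the standard requests library for faster_than_requests (so the fetch call is an assumption on my part) and flattens the per-link lists into a single data list:

from multiprocessing import Pool

import bs4 as bs
import requests  # standard requests, used here instead of faster_than_requests

def scrape(link):
    """Fetch one page and return the text of all its <p> tags."""
    url_common = '' if str(link).startswith('htt') else 'https://www.common_url.com/'
    try:
        html = requests.get(url_common + str(link), timeout=30).text
    except requests.RequestException:
        return []  # keep the return type uniform so map() results can be flattened
    parsed = bs.BeautifulSoup(html, 'lxml')
    return [p.text for p in parsed.find_all('p')]

if __name__ == '__main__':
    links = ['link', 'other_link', 'another_link']  # placeholders from the question
    with Pool(5) as pool:
        per_link = pool.map(scrape, links)            # one list of paragraphs per link
    data = [text for page in per_link for text in page]  # flatten into a single list
    print(len(data), 'paragraphs collected')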

How do I read this stringified javascript variable into Python?

I'm trying to read _pageData from https://www.simpliowebstudio.com/wp-content/uploads/2014/07/aWfyh1 into Python 2.7.11 so that I can process it, using this code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Testing _pageData processing. """
import urllib2
import re
import ast
import json
import yaml

BASE_URL = 'https://www.simpliowebstudio.com/wp-content/uploads/2014/07/aWfyh1'

def main():
    """ Do the business. """
    response = urllib2.urlopen(BASE_URL, None)
    results = re.findall('var _pageData = \\"(.*?)\\";</script>', response.read())
    first_result = results[0]
    # These all fail
    data = ast.literal_eval(first_result)
    # data = yaml.load(first_result)
    # data = json.loads(first_result)

if __name__ == '__main__':
    main()
but get the following error:
Traceback (most recent call last):
File "./temp.py", line 24, in <module>
main()
File "./temp.py", line 19, in main
data = ast.literal_eval(first_result)
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ast.py", line 49, in literal_eval
node_or_string = parse(node_or_string, mode='eval')
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ast.py", line 37, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
File "<unknown>", line 1
[[1,true,true,true,true,true,true,true,true,,\"at\",\"\",\"\",1450364255674,\"\",\"en_US\",false,[]\n,\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/embed?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/edit?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/thumbnail?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",,,true,\"https://www.google.com/maps/d/print?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/pdf?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",false,false,\"/maps/d\",\"maps/sharing\",\"//www.google.com/intl/en_US/help/terms_maps.html\",true,\"https://docs.google.com/picker\",[]\n,false,true,[[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-001.png\",143,25]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-2x-001.png\",286,50]\n]\n,[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-001.png\",113,20]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-2x-001.png\",226,40]\n]\n]\n,1,\"https://www.gstatic.com/mapspro/_/js/k\\u003dmapspro.gmeviewer.en_US.8b9lQX3ifcs.O/m\\u003dgmeviewer_base/rt\\u003dj/d\\u003d0/rs\\u003dABjfnFWonctWGGtD63MaO3UZxCxF6UPKJQ\",true,true,false,true,\"US\",false,true,true,5,false]\n,[\"mf.map\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",\"Hollywood, FL\",\"\",[-80.16005,26.01043,-80.16005,26.01043]\n,[-80.16005,26.01043,-80.16005,26.01043]\n,[[,\"zBghbRiSwHlg.kq4rrF9BNRIg\",\"Untitled layer\",\"\",[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026scale\\u003d1.0\"]\n,[]\n,1,1,[[,[26.01043,-80.16005]\n]\n,\"MDZBMzJCQjRBOTAwMDAwMQ~CjISKmdlby1tYXBzcHJvLm1hcHNob3AtbGF5ZXItNDUyOWUwMTc0YzhkNmI2ZBgAKAAwABIZACBawIJBU4Fe8v7vNSoAg0dtnhhVotEBLg\",\"vdb:\",\"zBghbRiSwHlg.kq4rrF9BNRIg\",[26.01043,-80.16005]\n,[0,-32]\n,\"06A32BB4A9000001\"]\n,[[\"Hollywood, FL\"]\n]\n,[]\n]\n]\n,,1.0,true,true,,,,[[\"zBghbRiSwHlg.kq4rrF9BNRIg\",1,,,,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\\u0026lid\\u003dzBghbRiSwHlg.kq4rrF9BNRIg\",,,,,0,2,true,[[[\"06A32BB4A9000001\",[[[26.01043,-80.16005]\n]\n]\n,[]\n,[]\n,0,[[\"name\",[\"Hollywood, FL\"]\n,1]\n,,[]\n,[]\n]\n,,0]\n]\n,[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026filter\\u003dff\\u0026scale\\u003d1.0\",[16,32]\n,1.0]\n,[[\"0000FF\",0.45098039215686275]\n,5000]\n,[[\"0000FF\",0.45098039215686275]\n,[\"000000\",0.25098039215686274]\n,3000]\n]\n]\n]\n]\n]\n,[]\n,,,,,1]\n]\n,[2]\n,,,\"mapspro\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",,true,false,false,\"\",2,false,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",3807]\n]\n
^
SyntaxError: invalid syntax
var _pageData is in this format:
"[[1,true,true,true,true,true,true,true,true,,\"at\",\"\",\"\",1450364255674,\"\",\"en_US\",false,[]\n,\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/embed?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/edit?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/thumbnail?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",,,true,\"https://www.google.com/maps/d/print?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/pdf?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",\"https://www.google.com/maps/d/viewer?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",false,false,\"/maps/d\",\"maps/sharing\",\"//www.google.com/intl/en_US/help/terms_maps.html\",true,\"https://docs.google.com/picker\",[]\n,false,true,[[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-001.png\",143,25]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-regular-2x-001.png\",286,50]\n]\n,[[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-001.png\",113,20]\n,[\"//www.gstatic.com/mapspro/images/google-my-maps-logo-small-2x-001.png\",226,40]\n]\n]\n,1,\"https://www.gstatic.com/mapspro/_/js/k\\u003dmapspro.gmeviewer.en_US.8b9lQX3ifcs.O/m\\u003dgmeviewer_base/rt\\u003dj/d\\u003d0/rs\\u003dABjfnFWonctWGGtD63MaO3UZxCxF6UPKJQ\",true,true,false,true,\"US\",false,true,true,5,false]\n,[\"mf.map\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",\"Hollywood, FL\",\"\",[-80.16005,26.01043,-80.16005,26.01043]\n,[-80.16005,26.01043,-80.16005,26.01043]\n,[[,\"zBghbRiSwHlg.kq4rrF9BNRIg\",\"Untitled layer\",\"\",[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026scale\\u003d1.0\"]\n,[]\n,1,1,[[,[26.01043,-80.16005]\n]\n,\"MDZBMzJCQjRBOTAwMDAwMQ~CjISKmdlby1tYXBzcHJvLm1hcHNob3AtbGF5ZXItNDUyOWUwMTc0YzhkNmI2ZBgAKAAwABIZACBawIJBU4Fe8v7vNSoAg0dtnhhVotEBLg\",\"vdb:\",\"zBghbRiSwHlg.kq4rrF9BNRIg\",[26.01043,-80.16005]\n,[0,-32]\n,\"06A32BB4A9000001\"]\n,[[\"Hollywood, FL\"]\n]\n,[]\n]\n]\n,,1.0,true,true,,,,[[\"zBghbRiSwHlg.kq4rrF9BNRIg\",1,,,,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\\u0026lid\\u003dzBghbRiSwHlg.kq4rrF9BNRIg\",,,,,0,2,true,[[[\"06A32BB4A9000001\",[[[26.01043,-80.16005]\n]\n]\n,[]\n,[]\n,0,[[\"name\",[\"Hollywood, FL\"]\n,1]\n,,[]\n,[]\n]\n,,0]\n]\n,[[[\"https://mt.googleapis.com/vt/icon/name\\u003dicons/onion/22-blue-dot.png\\u0026filter\\u003dff\\u0026scale\\u003d1.0\",[16,32]\n,1.0]\n,[[\"0000FF\",0.45098039215686275]\n,5000]\n,[[\"0000FF\",0.45098039215686275]\n,[\"000000\",0.25098039215686274]\n,3000]\n]\n]\n]\n]\n]\n,[]\n,,,,,1]\n]\n,[2]\n,,,\"mapspro\",\"zBghbRiSwHlg.k2ATNtn6BCk0\",,true,false,false,\"\",2,false,\"https://mapsengine.google.com/map/kml?mid\\u003dzBghbRiSwHlg.k2ATNtn6BCk0\",3807]\n]\n"
I've tried replacing the \" and \n and decoding the \uXXXX before using, without success. I've also tried replacing ,, with ,"", and ,'', without success.
Thank you.
It seems like there are three kinds of syntactic errors in your string:
, followed by ,
[ followed by ,
, followed by ]
Assuming that those are supposed to be null elements (or ''?), you can just replace those in the original string -- exactly like you did for the ,, case, but you missed the others. Also, you have to do the ,, replacement twice, otherwise you will miss cases such as ,,,,. Then, you can load the JSON string with json.loads.
>>> s = "your messed up json string"
>>> s = re.sub(r",\s*,", ", null,", s)
>>> s = re.sub(r",\s*,", ", null,", s)
>>> s = re.sub(r"\[\s*,", "[ null,", s)
>>> s = re.sub(r",\s*\]", ", null]", s)
>>> json.loads(s)
I started off using ast.literal_eval(...) because I was under the (mistaken?) impression that JavaScript arrays and Python lists were mutually compatible, so all I had to do was destringify _pageData.
However, I hadn't noticed that Python doesn't like ,, true, false or [,. Fixing them does the trick (thank you @Two-Bit Alchemist and @tobias_k).
So, the following appears to work:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Testing _pageData processing. """
import urllib2
import re
import ast
import json
import yaml

BASE_URL = 'https://www.simpliowebstudio.com/wp-content/uploads/2014/07/aWfyh1'

def main():
    """ Do the business. """
    response = urllib2.urlopen(BASE_URL, None)
    results = re.findall('var _pageData = \\"(.*?)\\";</script>', response.read())
    first_result = results[0]
    first_result = first_result.replace(',,,,,,', ',None,None,None,None,None,')
    first_result = first_result.replace(',,,,,', ',None,None,None,None,')
    first_result = first_result.replace(',,,,', ',None,None,None,')
    first_result = first_result.replace(',,,', ',None,None,')
    first_result = first_result.replace(',,', ',None,')
    first_result = first_result.replace('[,', '[None,')
    first_result = first_result.replace('\\"', '\'')
    first_result = first_result.replace('\\n', '')
    first_result = first_result.replace('true', 'True')
    first_result = first_result.replace('false', 'False')
    data = ast.literal_eval(first_result)
    for entry in data:
        print entry

if __name__ == '__main__':
    main()

TypeError: 'NoneType' object is not iterable: Webcrawler to scrape email addresses

I am trying to get the below program working. It is supposed to find email addresses on a website, but it is breaking. I suspect the problem is with initializing result = [] inside the crawl function. Below is the code:
# -*- coding: utf-8 -*-
import requests
import re
import urlparse

# In this example we're trying to collect e-mail addresses from a website
# Basic e-mail regexp:
# letter/number/dot/comma @ letter/number/dot/comma . letter/number
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')

# HTML <a> regexp
# Matches href="" attribute
link_re = re.compile(r'href="(.*?)"')

def crawl(url, maxlevel):
    result = []
    # Limit the recursion, we're not downloading the whole Internet
    if(maxlevel == 0):
        return
    # Get the webpage
    req = requests.get(url)
    # Check if successful
    if(req.status_code != 200):
        return []
    # Find and follow all the links
    links = link_re.findall(req.text)
    for link in links:
        # Get an absolute URL for a link
        link = urlparse.urljoin(url, link)
        result += crawl(link, maxlevel - 1)
    # Find all emails on current page
    result += email_re.findall(req.text)
    return result

emails = crawl('http://ccs.neu.edu', 2)

print "Scraped e-mail addresses:"
for e in emails:
    print e
The error I get is below:
C:\Python27\python.exe "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py"
Traceback (most recent call last):
File "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py", line 41, in <module>
emails = crawl('http://ccs.neu.edu', 2)
File "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py", line 35, in crawl
result += crawl(link, maxlevel - 1)
File "C:/Users/Sagar Shah/PycharmProjects/crawler/webcrawler.py", line 35, in crawl
result += crawl(link, maxlevel - 1)
TypeError: 'NoneType' object is not iterable
Process finished with exit code 1
Any suggestions will help. Thanks!
The problem is this:
if(maxlevel == 0):
    return
Currently it returns None when maxlevel == 0. You can't concatenate a list with a None object.
You need to return an empty list [] instead, to be consistent.
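
Concretely, the guard at the top of crawl would become (a minimal sketch of just the changed lines, not the whole function):

def crawl(url, maxlevel):
    result = []
    # Limit the recursion, we're not downloading the whole Internet
    if maxlevel == 0:
        return []  # was "return", which yields None and breaks "result += crawl(...)"
    # ... rest of the function unchanged ...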
