I'm trying to read the multistream Wikipedia dump into a database. This is my attempt at loading smaller chunks in parallel. Here's the script:
#!/usr/bin/python3
import xml.sax
from bz2 import BZ2File
import mwparserfromhell
import psycopg2
import pathos
import os
import dill


class XmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def get_pages(self):
        return self._pages

    def get_page_count(self):
        return len(self._pages)

    def get_values(self):
        return self._values

    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text', 'infobox'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            self.process_article()

    def process_article(self):
        wikicode = mwparserfromhell.parse(self._values['text'])
        infobox_array = wikicode.filter_templates(matches="infobox .*")
        infobox = str(infobox_array[0]) if len(infobox_array) > 0 else ""
        self._pages.append((self._values['title'], self._values['text'], infobox))


def load_xml(filename):
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)
    file = os.path.join("chunks", filename)
    print("I'm a worker process")
    cursor = conn.cursor()
    with BZ2File(file, 'r') as f:
        for line in f:
            wiki_parser.feed(line)
    pages = wiki_handler.get_pages()
    for page in pages:
        cursor.execute("INSERT INTO pages (title, text, infobox) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", page)
    cursor.close()
    print("all done")


if __name__ == "__main__":
    conn = psycopg2.connect(dbname="wikipedia",
                            user="postgres",
                            password="postgres",
                            host="localhost",
                            port=5432)
    file_list = [f for f in os.listdir("chunks") if os.path.isfile(os.path.join("chunks", f))]
    pool = pathos.multiprocessing.ProcessingPool(processes=pathos.multiprocessing.cpu_count())
    pool.map(load_xml, file_list)
And the traceback:
Traceback (most recent call last):
  File "./loader_parallel.py", line 114, in <module>
    pool.map(load_xml, file_list)
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 657, in get
    raise self._value
multiprocess.pool.MaybeEncodingError: Error sending result:
'<multiprocess.pool.ExceptionWithTraceback object at 0x7f87ac0f4470>'.
Reason: 'TypeError("can't pickle pyexpat.xmlparser objects")'
Why can't the pyexpat.xmlparser object be pickled, and how can I fix it? I tried testing the handler with dill.copy(XmlHandler()) and that succeeded without error.
I installed pathos via pip3 and am running Python 3.7 on Debian 10. I'm pretty new to this, so any help is appreciated. Thanks in advance!
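Note that the object the pool fails to send back is an ExceptionWithTraceback, which means something already raised inside load_xml; the pickling error is about shipping that failure (and the state it references) back to the parent. One restructuring that is commonly suggested for this kind of error (a sketch under the question's assumptions about the chunks/ directory and the pages table, not a verified fix) is to keep everything unpicklable inside the worker: each process builds its own parser and opens its own psycopg2 connection, and the function returns only a plain integer, so nothing that crosses the process boundary involves pyexpat or libpq.

def load_xml(filename):
    # Sketch only: per-worker parser and per-worker DB connection.
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)

    # Each worker opens (and closes) its own connection instead of relying on
    # a connection created in the parent process.
    conn = psycopg2.connect(dbname="wikipedia", user="postgres",
                            password="postgres", host="localhost", port=5432)
    try:
        with BZ2File(os.path.join("chunks", filename), 'r') as f:
            for line in f:
                wiki_parser.feed(line)
        with conn.cursor() as cursor:
            cursor.executemany(
                "INSERT INTO pages (title, text, infobox) "
                "VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
                wiki_handler.get_pages())
        conn.commit()
    finally:
        conn.close()
    # Return something trivially picklable.
    return wiki_handler.get_page_count()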
I built a scraper to run through a job site and save all potential job data to a CSV file and then to my MySQL database. For some reason, the scraper stops running after pulling jobs from the first city in the list. Here's what I mean:
City List Code:
Cities = {
    'cities': ['washingtondc',
               'newyork',
               'sanfrancisco',
               '...',
               '...']
}
Scrapy Spider Code:
# -*- coding: utf-8 -*-
from city_list import Cities
import scrapy, os, csv, glob, pymysql.cursors


class JobsSpider(scrapy.Spider):
    name = 'jobs'

    c_list = Cities['cities']
    for c in c_list:
        print(f'Searching {c} for jobs...')
        allowed_domains = [f'{c}.jobsite.com']
        start_urls = [f'https://{c}.jobsite.com/search/jobs/']

        def parse(self, response):
            listings = response.xpath('//li[@class="listings-path"]')
            for listing in listings:
                date = listing.xpath('.//*[@class="date-path"]/@datetime').extract_first()
                link = listing.xpath('.//a[@class="link-path"]/@href').extract_first()
                text = listing.xpath('.//a[@class="text-path"]/text()').extract_first()
                yield scrapy.Request(link,
                                     callback=self.parse_listing,
                                     meta={'date': date,
                                           'link': link,
                                           'text': text})
            next_page_url = response.xpath('//a[text()="next-path "]/@href').extract_first()
            if next_page_url:
                yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

        def parse_listing(self, response):
            date = response.meta['date']
            link = response.meta['link']
            text = response.meta['text']
            compensation = response.xpath('//*[@class="compensation-path"]/span[1]/b/text()').extract_first()
            employment_type = response.xpath('//*[@class="employment-type-path"]/span[2]/b/text()').extract_first()
            images = response.xpath('//*[@id="images-path"]//@src').extract()
            address = response.xpath('//*[@id="address-path"]/text()').extract()
            yield {'date': date,
                   'link': link,
                   'text': text,
                   'compensation': compensation,
                   'type': employment_type,
                   'images': images,
                   'address': address}

        def close(self, reason):
            csv_file = max(glob.iglob('*.csv'), key=os.path.getctime)
            conn = pymysql.connect(host='localhost',
                                   user='root',
                                   password='**********',
                                   db='jobs_database',
                                   charset='utf8mb4',
                                   cursorclass=pymysql.cursors.DictCursor)
            cur = conn.cursor()
            csv_data = csv.reader(open('jobs.csv'))
            for row in csv_data:
                cur.execute('INSERT INTO jobs_table(date, link, text, compensation, type, images, address) '
                            'VALUES(%s, %s, %s, %s, %s, %s, %s)', row)
            conn.commit()
            conn.close()
            print("Done Importing!")
The scraper works fine, but it stops running after grabbing jobs from washingtondc and exits.
How do I solve this problem?
UPDATE -
I changed the code above to
class JobsSpider(scrapy.Spider):
    name = 'jobs'
    allowed_domains = []
    start_urls = []

    def __init__(self, *args, **kwargs):
        super().__init__(self, *args, **kwargs)
        c_list = Cities['cities']
        for c in c_list:
            print(f'Searching {c} for jobs...')
            self.allowed_domains.append(f'{c}.jobsearch.com')
            self.start_urls.append(f'https://{c}.jobsearch.com/search/jobs/')

    def parse(self, response):
        ...
and am now getting "RecursionError: maximum recursion depth exceeded while calling a Python object"
Here's the traceback:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 1034, in emit
    msg = self.format(record)
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 880, in format
    return fmt.format(record)
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 619, in format
    record.message = record.getMessage()
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/logging/__init__.py", line 380, in getMessage
    msg = msg % self.args
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 107, in __str__
    return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 107, in __str__
    return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
  File "/usr/local/Cellar/python/3.7.2_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scrapy/spiders/__init__.py", line 107, in __str__
    return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
  [Previous line repeated 479 more times]
RecursionError: maximum recursion depth exceeded while calling a Python object
The first problem is that your spider variables and methods are inside the for loop. Instead, you need to set those member variables in __init__(). Without testing the rest of your logic, here's a rough idea of what you need to do instead:
class JobsSpider(scrapy.Spider):
    name = 'jobs'

    # Don't do the for loop here.
    allowed_domains = []
    start_urls = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        c_list = Cities['cities']
        for c in c_list:
            self.allowed_domains.append(f'{c}.jobsite.com')
            self.start_urls.append(f'https://{c}.jobsite.com/search/jobs/')

    def parse(self, response):
        # ...
If you still have problems after this, update your question and I will try to update the answer.
To explain what's going wrong: When you have a for loop like in your question, it's going to end up overwriting the variables and functions. Here is an example directly in Python's shell:
>>> class SomeClass:
...     for i in range(3):
...         print(i)
...         value = i
...         def get_value(self):
...             print(self.value)
...
0
1
2
>>> x = SomeClass()
>>> x.value
2
>>> x.get_value()
2
Basically the for loop is executed before you even use the class. So this doesn't end up running the function multiple times, but rather redefining it multiple times. The end result is that your functions and variables point to whatever was set last.
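As for the RecursionError in your update: the traceback suggests it comes from the line super().__init__(self, *args, **kwargs). Because super() already binds self, passing self again means Scrapy's Spider.__init__ receives the spider instance as its name argument; when Scrapy later logs the spider, __str__ formats self.name, which is the spider itself, so the formatting calls itself forever. The corrected call is the one used in the snippet above:

# Wrong: passes the spider twice, so Spider.__init__ ends up with name=self
super().__init__(self, *args, **kwargs)

# Right: super() already supplies self
super().__init__(*args, **kwargs)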
When I run
$ python -m weatherterm -a USNY1192:1:US -p WeatherComParser -td
to output the parsed data in the terminal, I get the following error:
Traceback (most recent call last):
  File "C:\Python36\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Python36\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Reza\weather-app\weatherterm\weatherterm\__main__.py", line 81, in <module>
    parser = cls()
  File "C:\Users\Reza\weather-app\weatherterm\weatherterm\parsers\weather_com_parser.py", line 25, in __init__
    self._request = Request(self._base_url)
TypeError: object() takes no parameters
I'd appreciate it if you could advise what went wrong here:
from weatherterm.core import ForecastType
import re
from weatherterm.core import Forecast
from weatherterm.core import Request
from weatherterm.core import Unit
from weatherterm.core import UnitConverter
from bs4 import BeautifulSoup


class WeatherComParser:
    def __init__(self):
        self._forecast = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_and_ten_days_forecast,
            ForecastType.TENDAYS: self._five_and_ten_days_forecast,
            ForecastType.WEEKEND: self._weekend_forecast,
        }

        self._base_url = 'http://weather.com/weather/{forecast}/l/{area}'
        self._request = Request(self._base_url)

        self._temp_regex = re.compile('([0-9]+)\D{,2}([0-9]+)')
        self._only_digits_regex = re.compile('[0-9]+')

        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def _get_additional_info(self, content):
        data = tuple(item.td.span.get_text()
                     for item in content.table.tbody.children)
        return data[:2]

    def _clear_str_number(self, str_number):
        result = self._only_digits_regex.match(str_number)
        return '--' if result is None else result.group()

    def _parse(self, container, criteria):
        results = [self._get_data(item, criteria)
                   for item in container.children]
        return [result for result in results if result]

    def _get_data(self, container, search_items):
        scraped_data = {}
        for key, value in search_items.items():
            result = container.find(value, class_=key)
            data = None if result is None else result.get_text()
            if data is not None:
                scraped_data[key] = data
        return scraped_data

    def _today_forecast(self, args):
        # raise NotImplementedError()
        criteria = {
            'today_nowcard-temp': 'div',
            'today_nowcard-phrase': 'div',
            'today_nowcard-hilo': 'div',
        }

        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        container = bs.find('section', class_='today_nowcard-container')
        weather_conditions = self._parse(container, criteria)

        if len(weather_conditions) < 1:
            raise Exception('Could not parse weather forecast for today.')

        weatherinfo = weather_conditions[0]

        temp_regex = re.compile(('H\s+([0-9]+|\-{,2}).+'
                                 'L\s+([0-9]+|\-{,2})'))
        temp_info = temp_regex.search(weatherinfo['today_nowcard-hilo'])
        high_temp, low_temp = temp_info.groups()

        side = container.find('div', class_='today_nowcard-sidecar')
        wind, humidity = self._get_additional_info(side)

        curr_temp = self._clear_str_number(weatherinfo['today_nowcard-temp'])

        self._unit_converter.dest_unit = args.unit

        td_forecast = Forecast(self._unit_converter.convert(curr_temp),
                               humidity,
                               wind,
                               high_temp=self._unit_converter.convert(
                                   high_temp),
                               low_temp=self._unit_converter.convert(
                                   low_temp),
                               description=weatherinfo['today_nowcard-phrase'])

        return [td_forecast]

    def _five_and_ten_days_forecast(self, args):
        raise NotImplementedError()

    def _weekend_forecast(self, args):
        raise NotImplementedError()

    def run(self, args):
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast[args.forecast_option]
        return forecast_function(args)
Looking at the call stack, it seems to me that something is wrong in the Request class in weatherterm/core/request.py. The error TypeError: object() takes no parameters means that Request.__init__ is not accepting the base_url argument being passed to it, so the call falls through to object's no-argument initializer. The initializer of the Request class should look like this:

class Request:
    def __init__(self, base_url):
        ...

Make sure the initializer actually accepts a base_url parameter.
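For reference, here is a minimal sketch of what such a class might look like. The fetch_data method, its parameters, and the use of urllib are assumptions based on how the parser calls it (self._request.fetch_data(args.forecast_option.value, args.area_code)), not the actual weatherterm code:

import urllib.request


class Request:
    def __init__(self, base_url):
        # e.g. 'http://weather.com/weather/{forecast}/l/{area}'
        self._base_url = base_url

    def fetch_data(self, forecast, area):
        # Fill in the URL template and return the raw page body.
        url = self._base_url.format(forecast=forecast, area=area)
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req) as response:
            return response.read()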
I'm trying to create a Collection Class in Python to access the various collections in my db. Here's what I've got:
import sys
import os
import pymongo
from pymongo import MongoClient


class Collection():
    client = MongoClient()

    def __init__(self, db, collection_name):
        self.db = db
        self.collection_name = collection_name
        # self.data_base = getattr(self.client, db)
        # self.collObject = getattr(self.data_base, self.collection_name)

    def getCollection(self):
        data_base = getattr(self.client, self.db)
        collObject = getattr(data_base, self.collection_name)
        return collObject

    def getCollectionKeys(self, collection):
        """Get a set of keys from a collection"""
        keys_list = []
        collection_list = collection.find()
        for document in collection_list:
            for field in document.keys():
                keys_list.append(field)
        keys_set = set(keys_list)
        return keys_set


if __name__ == '__main__':
    print "Begin Main"
    agents = Collection('hkpr_restore', 'agents')
    print "agents is", agents
    agents_collection = agents.getCollection
    print agents_collection
    print agents.getCollectionKeys(agents_collection)
I get the following output:
Begin Main
agents is <__main__.Collection instance at 0x10ff33e60>
<bound method Collection.getCollection of <__main__.Collection instance at 0x10ff33e60>>
Traceback (most recent call last):
  File "collection.py", line 52, in <module>
    print agents.getCollectionKeys(agents_collection)
  File "collection.py", line 35, in getCollectionKeys
    collection_list = collection.find()
AttributeError: 'function' object has no attribute 'find'
The function getCollectionKeys works fine outside of a class. What am I doing wrong?
This line:
agents_collection = agents.getCollection
Should be:
agents_collection = agents.getCollection()
Also, you don't need to use getattr the way you are. Your getCollection method can be:
def getCollection(self):
    return self.client[self.db][self.collection_name]
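With those two changes, the main block would look something like this (same database and collection names as in the question):

agents = Collection('hkpr_restore', 'agents')
agents_collection = agents.getCollection()   # note the parentheses: call the method
print agents.getCollectionKeys(agents_collection)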
I'm trying to get my emails to work from the command line with mutt. I have been trying to follow these two guides: http://blog.developwithpassion.com/2013/05/02/getting-up-and-running-with-a-sane-mutt-setup/ and http://stevelosh.com/blog/2012/10/the-homely-mutt/#configuring-offlineimap
To do this there are 4 main steps:
1. setup offlineimap to download and keep synced your emails
2. setup mutt (the email user interface)
3. setup notmuch (to be able to search your emails)
4. setup msmtp (to be able to send emails)
Note that I am using a MacBook Pro running OS X 10.9.2. I am stuck at step 1 because I am getting an error from offlineimap! It runs for a long time, i.e. it syncs all of the emails (36,197 of them!), and then right at the end it spits out the following error:
ERROR: getfolder() asked for a nonexisting folder 'Drafts'.
Folder Deleted Items [acc: gloriphobia]:
Establishing connection to imap.gmail.com:993
Account sync gloriphobia:
*** Finished account 'gloriphobia' in 137:16
ERROR: Exceptions occurred during the run!
ERROR: command: UID => socket error: <class 'socket.error'> - [Errno 54] Connection reset by peer
Traceback:
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/folder/IMAP.py", line 219, in getmessage
    '(BODY.PEEK[])')
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 1167, in uid
    return self._simple_command('UID', command, *args, **kw)
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 1615, in _simple_command
    return self._command_complete(self._command(name, *args), kw)
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 1378, in _command_complete
    typ, dat = rqb.get_response('command: %s => %%s' % rqb.name)
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 176, in get_response
    raise typ(exc_fmt % str(val))

ERROR: getfolder() asked for a nonexisting folder 'Drafts'.
Traceback:
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/accounts.py", line 241, in syncrunner
    self.sync()
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/accounts.py", line 320, in sync
    localfolder = self.get_local_folder(remotefolder)
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/accounts.py", line 269, in get_local_folder
    replace(self.remoterepos.getsep(), self.localrepos.getsep()))
  File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/repository/Maildir.py", line 134, in getfolder
    OfflineImapError.ERROR.FOLDER)
My .offlineimaprc is:
[general]
accounts = gloriphobia
ui = TTYUI
pythonfile=~/Development/MuttMailPython/offline.py
fsync = False
[Account gloriphobia]
localrepository = gloriphobia_local
remoterepository = gloriphobia_remote
status_backend = sqlite
postsynchook = notmuch new
[Repository gloriphobia_local]
type = Maildir
localfolders = ~/.mail/Test
nametrans = get_remote_name
[Repository gloriphobia_remote]
maxconnections = 1
type = Gmail
cert_fingerprint = 89091347184d41768bfc0da9fad94bfe882dd358
remoteuser = myemailaddress
remotepasseval = get_keychain_pass(account="myemailaddress",server="imap.gmail.com")
realdelete = no
nametrans = get_local_name
folderfilter = is_included
My Python file, the one called offline.py, is:
#!/usr/bin/python
import subprocess
import re


class NameMapping:
    def __init__(self, local_name, remote_name):
        self.local_name = local_name
        self.remote_name = remote_name


class LocalName:
    def __init__(self, folder):
        self.folder = folder

    def matches(self, mapping):
        return mapping.remote_name == self.folder

    def mapped_folder_name(self, mapping):
        return mapping.local_name


class RemoteName:
    def __init__(self, folder):
        self.folder = folder

    def matches(self, mapping):
        return mapping.local_name == self.folder

    def mapped_folder_name(self, mapping):
        return mapping.remote_name


def get_keychain_pass(account=None, server=None):
    params = {
        'security': '/usr/bin/security',
        'command': 'find-internet-password',
        'account': account,
        'server': server,
        'keychain': '/Users/mec07/Library/Keychains/login.keychain',
    }
    command = "sudo -u mec07 %(security)s -v %(command)s -g -a %(account)s -s %(server)s %(keychain)s" % params
    output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
    outtext = [l for l in output.splitlines()
               if l.startswith('password: ')][0]
    return re.match(r'password: "(.*)"', outtext).group(1)


def is_included(folder):
    result = True
    for pattern in exclusion_patterns:
        result = result and (re.search(pattern, folder) == None)
    return result


exclusion_patterns = [
    "efax",
    "earth_class_mail",
    "eventbrite",
    "gotomeeting",
    "moshi_monsters",
    "peepcode",
    "raini_fowl",
    "stuart_know",
    "training.*2008",
    "training.*2009",
    "training.*2010",
    "training.*2011",
    "training.*2012",
    "training.*nbdn",
    "training.*nothin_but_bdd",
    "unblock_us",
    "web_hosting",
    "webinars",
    "Gmail.*Important"
]

name_mappings = [
    NameMapping('inbox', '[Gmail]/Inbox'),
    NameMapping('starred', '[Gmail]/Starred'),
    NameMapping('important', '[Gmail]/Important'),
    NameMapping('sent', '[Gmail]/Sent Mail'),
    NameMapping('drafts', '[Gmail]/Drafts'),
    NameMapping('archive', '[Gmail]/All Mail'),
    NameMapping('spam', '[Gmail]/Spam'),
    NameMapping('flagged', '[Gmail]/Starred'),
    NameMapping('trash', '[Gmail]/Trash'),
    NameMapping('deleted', '[Gmail]/Deleted Items'),
    NameMapping('Mum', '[Gmail]/Jana'),
    NameMapping('Maggie', '[Gmail]/Maggie'),
    NameMapping('papers', '[Gmail]/Scholar Alert'),
    NameMapping('sent items', '[Gmail]/Sent Items'),
    NameMapping('sent messages', '[Gmail]/Sent Messages')
]


def find_name_mapping(name):
    default_mapping = NameMapping(name.folder, name.folder)
    for mapping in name_mappings:
        if (name.matches(mapping)):
            return mapping
    return default_mapping


def get_name_mapping(name):
    mapping = find_name_mapping(name)
    return name.mapped_folder_name(mapping)


def get_remote_name(local_folder_name):
    name = RemoteName(local_folder_name)
    return get_name_mapping(name)


def get_local_name(remote_folder_name):
    name = LocalName(remote_folder_name)
    return get_name_mapping(name)
Thanks in advance for your help!
Add:
folderfilter = lambda folder: folder not in ['Drafts']
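Note that your [Repository gloriphobia_remote] section already sets folderfilter = is_included, and a repository can only have one folderfilter. One way to combine the two (a sketch, assuming the offending folder really does show up under the plain name 'Drafts') is to fold the check into the is_included function in offline.py:

def is_included(folder):
    # Skip the folder that offlineimap cannot map locally, plus the
    # existing exclusion patterns.
    if folder in ['Drafts']:
        return False
    for pattern in exclusion_patterns:
        if re.search(pattern, folder) is not None:
            return False
    return True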
I need to create some files from a template. I'm using psycopg2 to fetch from a database, then I loop through the rows; now I need to write each one out to a file. Thanks!
import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
import psycopg2
import psycopg2.extras


class LinksParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        self.tb.start(tag, dict(attributes))

    def handle_endtag(self, tag):
        self.tb.end(tag)

    def handle_data(self, data):
        self.tb.data(data)

    def close(self):
        HTMLParser.close(self)
        return self.tb.close()


conn = psycopg2.connect(dbname="**", user="**", password="**", host="/tmp/", port="**")
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cur.execute("SELECT * FROM landingpagedata;")
rows = cur.fetchall()

template = 'template.html'

parser = LinksParser()
# parser.feed(open('landingIndex.html').read())  # for testing
# root = parser.close()
for row in rows:
    parser.feed(open(template).read())
    root = parser.close()

    # title
    title = root.find(".//title")
    title.text = str(row['title'])

    f = open(row['page_name'], 'w')
    root.write(f)

    parser = LinksParser()
The error is:
Traceback (most recent call last):
  File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 87, in <module>
    main()
  File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 75, in main
    root.write('page_name')
AttributeError: write
Oh and I'm using open('page', 'w') because these pages exist already?
I think you want f.write(root), not root.write(f). (Assuming that str(root) gives you the HTML you want to write out.)
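If str(root) turns out not to give the serialized HTML (root comes from TreeBuilder.close(), so it is an Element rather than a string), a sketch of the write step, keeping the rest of the loop from the question, would be to serialize the tree explicitly:

f = open(row['page_name'], 'w')
f.write(etree.tostring(root))
f.close()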