how to run scrapy from python script - python

my file list:
.
|-- lf
| |-- __init__.py
| |-- __init__.pyc
| |-- items.py
| |-- items.pyc
| |-- pipelines.py
| |-- settings.py
| |-- settings.pyc
| `-- spiders
| |-- bbc.py
| |-- bbc.pyc
| |-- __init__.py
| |-- __init__.pyc
| |-- lwifi.py
| `-- lwifi.pyc
|-- scrapy.cfg
`-- script.py
items.py
from scrapy.item import Item, Field
class LfItem(Item):
    """Scrapy item declared for the ``lf`` project."""

    # The single field declared on this item.
    topic = Field()
script.py:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from lf.spiders.lwifi import LwifiSpider
from scrapy.utils.project import get_project_settings
# Build the spider; the `domain` keyword is passed to LwifiSpider.__init__.
spider = LwifiSpider(domain='Lifehacker.co.in')
settings = get_project_settings()
crawler = Crawler(settings)
# Call reactor.stop when the spider_closed signal is sent, so that
# reactor.run() at the bottom of the script can return.
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
# Start the Twisted reactor; it keeps running until reactor.stop is called.
reactor.run()
lwifi.py:
from scrapy.spider import Spider
from scrapy.selector import Selector
class LwifiSpider(Spider):
    """Spider that requests one page and prints the text of its ``<h1>`` elements.

    Keyword arguments:
        url: address of the page to fetch (takes precedence over ``domain``).
        domain: used as the address when ``url`` is not given.

    When neither is supplied, a default lifehacker.co.in article is fetched.
    ``http://`` is prepended (and a trailing ``/`` appended) when the value
    does not start with a scheme.
    """

    name = "lwifi"

    def __init__(self, **kw):
        super(LwifiSpider, self).__init__(**kw)
        url = (
            kw.get('url')
            or kw.get('domain')
            or 'lifehacker.co.in/others/Dont-Use-Personal-Information-in-Your-Wi-Fi-Network-Name/articleshow/45407704.cms'
        )
        if not url.startswith(('http://', 'https://')):
            url = 'http://%s/' % url
        self.url = url
        # allowed_domains takes bare domain names, not full page paths.
        self.allowed_domains = ["lifehacker.co.in"]

    def start_requests(self):
        """Return the single request for ``self.url``, handled by ``parse``."""
        # Imported here because the module-level imports shown for this file
        # do not bring Request into scope.
        from scrapy.http import Request
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        """Extract the text of every ``<h1>`` in the response and print it."""
        topic = response.xpath("//h1/text()").extract()
        print(topic)
I am new to Python and Scrapy. As a start, I wrote a simple Scrapy spider to run from a Python script (not using Scrapinghub). My aim is to scrape the h1 from the page http://lifehacker.co.in/others/Dont-Use-Personal-Information-in-Your-Wi-Fi-Network-Name/articleshow/45407704.cms. The error is:
Traceback (most recent call last):
File "script.py", line 4, in <module>
from lf.spiders.lwifi import LwifiSpider
File "/home/ajay/pythonpr/error/lf/lf/spiders/lwifi.py", line 7, in <module>
class LwifiSpider(Spider):
File "/home/ajay/pythonpr/error/lf/lf/spiders/lwifi.py", line 11, in LwifiSpider
url = kw.get('url') or kw.get('domain') or 'lifehacker.co.in/others/Dont-Use-Personal- Information-in-Your-Wi-Fi-Network-Name/articleshow/45407704.cms'
NameError: name 'kw' is not defined
please help.

If you look carefully at the traceback, you will see that the error occurs in the body of the LwifiSpider class:
File "/home/.../lwifi.py", line 11, in LwifiSpider
If the error occurred in the __init__ of that class you'd see a line like this instead:
File "/home/.../lwifi.py", line 11, in __init__
So it would appear that there is some kind of indentation error that is causing the problematic line to be outside of the __init__ method, where the kw argument cannot be seen.
Try re-indenting the whole of the __init__ function, and make sure you are not mixing tabs and spaces anywhere (any decent text editor should allow you to make all the whitespace visible).

Follow these simple steps on how to run Python/Ajax crawls. How to set interactive crawls using AJax

Related

Unable to import a module and dependencies in Python unit test

I am using Python's unittest library to add a unit test. This is what the folder structure of my project looks like.
Project
|-- __init__.py
|
|-- src
| |
| |-- __init__.py
| |-- main.py
|
|-- test
|-- __init__.py
|-- test_basic.py
Code for main.py
from dotenv import load_dotenv
load_dotenv()
import newrelic.agent
newrelic.agent.initialize()
from .processAnalyticsData import run_query_and_upload_results_to_s3, upload_script_results_to_s3
import sys
import click
import getopt
import json
import os
BUCKET_NAME = os.environ["S3_BUCKET_NAME"]
cwd = os.getcwd()
@click.group(chain=True)
def run_scripts():
    """Click command group under which ``run_script`` is registered."""
    pass


@run_scripts.command('run_script')
@click.option('-f', '--frequency', required=True, nargs=1, type=click.Choice(['WEEKLY', 'DAILY', 'ONCE', 'MONTHLY'], case_sensitive=False))
def run_script(frequency):
    """Run every configured script whose ``frequency`` equals the given one.

    The configuration is read from ``src/config.json`` below the working
    directory stored in ``cwd``. Each entry of its ``executable_scripts``
    list is expected to provide ``frequency``, ``fileName``, ``path`` and
    ``type`` keys.

    Args:
        frequency: value of the ``-f/--frequency`` option.

    Raises:
        Exception: if an entry's ``type`` is neither ``'sql'`` nor ``'python'``.
    """
    config_path = os.path.join(cwd, 'src', 'config.json')
    with open(config_path) as config_file:
        script_paths = json.load(config_file)

    selected = [x for x in script_paths["executable_scripts"] if x["frequency"] == frequency]
    for item in selected:
        file_name = item["fileName"]
        script_name = item["path"]
        script_type = item["type"]
        if script_type == 'sql':
            run_query_and_upload_results_to_s3(script_name, BUCKET_NAME, file_name)
        elif script_type == 'python':
            upload_script_results_to_s3(script_name, BUCKET_NAME, file_name)
        else:
            raise Exception('Script type is incorrect. Please provide the correct value in the config file.')


if __name__ == "__main__":
    run_scripts()
I started writing my unit test like this:
from src.main import *
import unittest
class WidgetTestCase(unittest.TestCase):
    """Skeleton test case; no test methods are defined yet."""

    def setUp(self):
        """Nothing to prepare so far."""
        pass


if __name__ == '__main__':
    unittest.main()
I get an error AttributeError: 'module' object has no attribute 'test_basic'
However, if I remove the line from src.main import * and run the test using python -m test.test_basic, it works perfectly fine. Is there a problem with the imports?
Upon running python -c "from src.main import *"
I noticed this error
File "<string>", line 1, in <module>
File "src/main.py", line 1, in <module>
from dotenv import load_dotenv
ImportError: No module named dotenv

How to import a class from another file in python?

I'm new to Python and have looked at various Stack Overflow posts. I feel like this should work, but it doesn't. How do you import a class from another file in Python?
This folder structure
src/example/ClassExample
src/test/ClassExampleTest
I have this class
class ClassExample:
    """Minimal example class exposing a single greeting method."""

    def __init__(self):
        pass

    def helloWorld(self):
        """Return the fixed greeting string."""
        return "Hello World!"
I have this test class
import unittest
from example import ClassExample
class ClassExampleTest(unittest.TestCase):
    """Unit test for ``ClassExample.helloWorld``."""

    def test_HelloWorld(self):
        """The greeting returned by a fresh instance is "Hello World!"."""
        hello = ClassExample()
        self.assertEqual("Hello World!", hello.helloWorld())


if __name__ == '__main__':
    unittest.main()
When the unit test runs the object is None:
AttributeError: 'NoneType' object has no attribute 'helloWorld'
What's wrong with this? How do you import a class in python?
If you're using Python 3, then imports are absolute by default. This means that import example will look for an absolute package named example, somewhere in the module search path.
So instead, you probably want a relative import. This is useful when you want to import a module that is relative to the module doing the importing. In this case:
from ..example.ClassExample import ClassExample
I'm assuming that your folders are Python packages, meaning that they contain __init__.py files. So your directory structure would look like this.
src
|-- __init__.py
|-- example
| |-- __init__.py
| |-- ClassExample.py
|-- test
| |-- __init__.py
| |-- ClassExampleTest.py

Python Flask session variables aren't recognized by imported functions

My Flask application.py was getting a bit large and I wanted to spread the content (classes/functions) across a couple of additional py files. I placed class definitions in an appClasses.py and that is imported fine by application.py using
from appClasses import *
And, I placed some function definitions into appFunction.py and import those into application.py using
from appFunction import *
The functions in appFunction.py make use of flask session variables. When the same functions are in application.py there is no issue with session variable references (and app behaves as expected) but as soon as I cut/paste into appFunction.py and import functions I get a name error exception when the first function using session vars is called:
NameError: name 'session' is not defined
Here's a sample function:
def load_hgw_dict():
    """Return the HGW dictionary, building it from the list file when needed.

    When the session already contains the ``hgw_dict_loaded`` flag, the value
    stored under ``session['hgw_dict']`` is returned. Otherwise the flag is
    set and the dictionary is built from the list file, whose lines consist
    of three whitespace-separated fields: common name, IP and hostname.

    NOTE(review): ``session`` must be a name in this module's own namespace
    (for example via ``from flask import session``); star-importing this
    module from application.py does not provide it here.
    NOTE(review): this function never assigns ``session['hgw_dict']`` itself;
    presumably the caller stores the returned dictionary - confirm.

    Returns:
        dict mapping each common name to ``{'ip': ..., 'hostname': ...}``.
    """
    print("LOAD HGW DICT: called")
    if 'hgw_dict_loaded' in session:
        print("LOAD HGW DICT: hgw dict is available")
        hgw_dict = session['hgw_dict']
    else:
        print("LOAD HGW DICT: hgw dict not available...adding")
        session['hgw_dict_loaded'] = True
        list_of_hgws = "/home/stbweb/LIST_OF_HENBGW"
        hgw_dict = dict()
        # The context manager closes the file even when a line cannot be
        # unpacked into exactly three fields.
        with open(list_of_hgws, "r") as hgw_file:
            for line in hgw_file:
                hgw_common, hgw_ip, hgw_hostname = line.split()
                hgw_dict[hgw_common] = {'ip': hgw_ip, 'hostname': hgw_hostname}
        print("LOAD HGW DICT: hgw dict =", hgw_dict)
    return hgw_dict
Session is created by flask itself when flask application.py is run.
Here's the project dir structure...some files removed for compactness
.
|-- appClasses.py
|-- appHeNBGW.py
|-- application.ini
|-- application.py
|-- application.pyc
|-- cert.pem
|-- flask_session
| -- 2029240f6d1128be89ddc32729463129
|-- __init.py__
|-- key.pem
|-- __pycache__
|<snip>
|-- README.md
|-- set_dev_env
|-- static |<snip>
|-- templates
| <snip>
-- wsgi.py

Error when execute query to create function from file in python flask sqlalchemy

I am trying to execute an SQL file to create a PostgreSQL function in my database, but I get an error.
File "/home/saputra/Documents/Development/Exercises/Python/flask/ujicoba06/smartapps/__init__.py", line 2, in <module>
from smartapps.app import create_app
File "/home/saputra/Documents/Development/Exercises/Python/flask/ujicoba06/smartapps/app.py", line 30, in <module>
from smartapps.commands import func_db
File "/home/saputra/Documents/Development/Exercises/Python/flask/ujicoba06/smartapps/commands/func_db.py", line 18
while current_app.open_resource("commands/func_query.sql") as f:
^
SyntaxError: invalid syntax
My structure:
.
|-- smartapps
| |-- app.py
| |-- commands
| | |-- func_db.py
| | |-- func_query.sql
| | |-- init_db.py
| | |-- __init__.py
func_query.sql
-- Function returning the staff members that have no user account.
-- Each result row carries the staff member's real_name and email.
CREATE OR replace FUNCTION get_staff_not_have_user()
RETURNS TABLE (
real_name varchar,
email varchar
)
AS $$
BEGIN
-- The LEFT JOIN on email combined with "us.email IS NULL" keeps only the
-- staff rows for which no users row with the same email exists.
RETURN query SELECT
st.real_name,
st.email
FROM
public.staff st
LEFT JOIN
public.users us on us.email = st.email
WHERE
us.email IS NULL;
END;
$$ LANGUAGE 'plpgsql';
func_db.py
# -*- coding: utf-8 -*-
import os
import click
from flask import current_app
from flask.cli import with_appcontext
from sqlalchemy.sql import text
from smartapps.extensions import db
def init_func():
"""Clear existing function and create new one"""
# NOTE(review): the traceback above points at the next line; an `as f` target
# is only valid after `with`, not after `while`.
while current_app.open_resource("commands/func_query.sql") as f:
# Decode the resource's bytes as UTF-8 and pass the SQL text to the session.
db.session.execute(f.read().decode("utf8"))
@click.command("init-func")
@with_appcontext
def func_query_command():
    """Execute sql script"""
    init_func()
    click.echo('Initialized function for SMART Systems.')
def init_app(app):
    """Register ``func_query_command`` with the application's CLI."""
    app.cli.add_command(func_query_command)
When I run it from the terminal, I get the error above.
How can I correct the code so that it executes the query from the file and creates the function in the PostgreSQL database?
Thanks
The syntax error is due to the wrong keyword in the line:
while current_app.open_resource("commands/func_query.sql") as f:
The first word should be with, not while.
This is the corrected code for func_db.py. I also added db.session.commit() after db.session.execute(). I am sharing the working code below.
func_db.py
import os
import click
from flask import current_app
from flask.cli import with_appcontext
from smartapps.extensions import db
def init_func():
    """Execute commands/func_query.sql on the database session and commit.

    The SQL file is opened through the current application's resource
    loader, decoded as UTF-8 and executed as one statement.
    """
    # Local import keeps this function self-contained. Textual SQL is wrapped
    # in text() because SQLAlchemy 2.0 no longer accepts plain strings in
    # Session.execute().
    from sqlalchemy.sql import text

    with current_app.open_resource("commands/func_query.sql") as f:
        db.session.execute(text(f.read().decode("utf8")))
    db.session.commit()
@click.command("init-func")
@with_appcontext
def func_query_command():
    """Execute sql script"""
    init_func()
    click.echo('Initialized function for SMART Systems.')
def init_app(app):
    """Register ``func_query_command`` with the application's CLI."""
    app.cli.add_command(func_query_command)
I hope this can save someone one day.
Happy coding!

How to test my Scrapy method with a unittest class

I want to test some methods of my spider.
For example, my project has this structure:
toto/
├── __init__.py
├── items.py
├── pipelines.py
├── settings.py
├── spiders
│ ├── __init__.py
│ └── mySpider.py
└── Unitest
└── unitest.py
My unitest.py looks like this:
# -*- coding: utf-8 -*-
import re
import weakref
import six
import unittest
from scrapy.selector import Selector
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from unittest.case import TestCase
from toto.spiders import runSpider
class SelectorTestCase(unittest.TestCase):
    """Demo test case that only prints a marker string."""

    # Selector class held as a class attribute.
    sscls = Selector

    def test_demo(self):
        # print() with a single argument behaves the same on Python 2 and 3.
        print("test")


if __name__ == '__main__':
    unittest.main()
And my mySpider.py looks like this:
import scrapy
class runSpider(scrapy.Spider):
    """Spider that follows date-style links from the start page and yields post titles."""

    name = 'blogspider'
    start_urls = ['http://blog.scrapinghub.com']

    def parse(self, response):
        """Request every ``ul li a`` link whose href ends in ``/YYYY/MM/``."""
        for url in response.css('ul li a::attr("href")').re(r'.*/\d\d\d\d/\d\d/$'):
            yield scrapy.Request(response.urljoin(url), self.parse_titles)

    def parse_titles(self, response):
        """Yield one ``{'title': ...}`` dict per link text found in the entries list."""
        for post_title in response.css('div.entries > ul > li a::text').extract():
            yield {'title': post_title}
In my unitest.py file, how can I call my spider?
I tried to add from toto.spiders import runSpider to my unitest.py file, but it does not work.
I've got this error:
Traceback (most recent call last): File "unitest.py", line 10, in
from toto.spiders import runSpider ImportError: No module named toto.spiders
How can I fix it?
Try:
import sys
import os
# Put the directory two levels above this file at the front of the module
# search path so that the import below can be resolved from there.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '../..')) # two folders up from the current file
# NOTE(review): with the layout shown in the question, two levels above
# Unitest/unitest.py appears to be the parent of toto/, in which case the
# import would need the `toto.` prefix - confirm against the real layout.
from spiders.mySpider import runSpider

Categories

Resources