I want to test some methods of my spider.
For example, my project has this structure:
toto/
├── __init__.py
├── items.py
├── pipelines.py
├── settings.py
├── spiders
│ ├── __init__.py
│ └── mySpider.py
└── Unitest
└── unitest.py
My unitest.py looks like this:
# -*- coding: utf-8 -*-
import re
import weakref
import six
import unittest
from scrapy.selector import Selector
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from unittest.case import TestCase
from toto.spiders import runSpider
class SelectorTestCase(unittest.TestCase):
    """Placeholder test case used to check that the test module runs."""

    # Selector class exposed to the tests as a class attribute.
    sscls = Selector

    def test_demo(self):
        """Smoke test: only prints a marker so a successful run is visible."""
        # The parenthesised form prints the same text on Python 2 and Python 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print("test")
if __name__ == '__main__':
unittest.main()
and my mySpider.py looks like this:
import scrapy
class runSpider(scrapy.Spider):
    """Spider that follows date-style links from the start page and yields titles."""

    name = 'blogspider'
    start_urls = ['http://blog.scrapinghub.com']

    def parse(self, response):
        """Yield a request for every ``ul li a`` href ending in ``/dddd/dd/``."""
        hrefs = response.css('ul li a::attr("href")')
        for href in hrefs.re(r'.*/\d\d\d\d/\d\d/$'):
            # Hrefs may be relative, so resolve them against the current page.
            target = response.urljoin(href)
            yield scrapy.Request(target, self.parse_titles)

    def parse_titles(self, response):
        """Yield one ``{'title': ...}`` dict per link text under ``div.entries``."""
        link_texts = response.css('div.entries > ul > li a::text').extract()
        for text in link_texts:
            yield {'title': text}
How can I call my spider from my unitest.py file?
I tried adding from toto.spiders import runSpider to unitest.py, but it does not work.
I've got this error:
Traceback (most recent call last): File "unitest.py", line 10, in
from toto.spiders import runSpider ImportError: No module named toto.spiders
How can I fix it?
Try:
import os
import sys

# unitest.py lives in toto/Unitest/, so two folders back from this file is the
# directory that CONTAINS the ``toto`` package. Putting it first on sys.path
# makes ``toto`` importable regardless of where the test is launched from.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '../..'))

# From that directory the import has to start at the package root; ``spiders``
# on its own is not a top-level module there.
from toto.spiders.mySpider import runSpider
Related
I am trying to run the unit test using pytest in this project, here main_0.py is importing s3 file.
I am getting ModuleNotFoundError: no module named 's3'
Project Folder Structure
some_project
└───src
├───main
│ └───lambda_function
│ └───some
│ main_0.py
│ s3.py
│
└───test
└───unittest
└───lambda_function
└───some
test_main_0.py
test_s3.py
main_0.py
from s3 import PrintS3
def lambda_handler():
    """Create a ``PrintS3`` and return the result of its ``print_txt()`` call."""
    return PrintS3().print_txt()
s3.py
class PrintS3:
    """Holds a fixed text and prints it on request."""

    def __init__(self) -> None:
        # Text that print_txt() prints and returns.
        self.txt = "Hello"

    def print_txt(self):
        """Print ``self.txt`` to stdout and return that same string."""
        message = self.txt
        print(message)
        return message
test_main_0.py
import unittest
class TestSomeMain(unittest.TestCase):
    """Checks the value returned by ``lambda_handler``."""

    def test_main_0(self):
        """``lambda_handler()`` must return ``"Hello"``."""
        # Deferred import: resolved when the test runs, not when the module loads.
        from src.main.lambda_function.some.main_0 import lambda_handler

        assert lambda_handler() == "Hello"
test_s3.py is empty.
I also tried adding an empty __init__.py file to both directories, but I still get the same error.
Project Folder Structure after adding __init__.py file
some_project
└───src
├───main
│ └───lambda_function
│ └───some
│ main_0.py
│ s3.py
│ __init__.py
│
└───test
└───unittest
└───lambda_function
└───some
test_main_0.py
test_s3.py
__init__.py
the command I am using to run pytest:
python -m pytest ./src/test
I run it from inside the some_project folder. The module is named main_0.py rather than main.py so that it is not confused with the main folder.
Edit 2:
I am able to run the test case successfully by adding a path to sys.path in the test_main_0.py file. I first thought this was breaking linting and hinting in the code editor (VS Code), but it did not break them. Both import statements work, but is there a better way?
new test_main_0.py:
import unittest
import os
import sys
sys.path.append(os.path.abspath("./src/main/lambda_function/some/"))
class TestSomeMain(unittest.TestCase):
    """Checks ``lambda_handler`` after trying both import spellings."""

    def test_main_0(self):
        """``lambda_handler()`` must return ``"Hello"``."""
        from src.main.lambda_function.some.main_0 import lambda_handler  # this works
        # This also works, but breaks linting and hinting in the code editor.
        # It rebinds the same name, so the call below uses this import.
        from main_0 import lambda_handler

        assert lambda_handler() == "Hello"
Could you please try:
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from some.s3 import PrintS3
def lambda_handler():
    """Create a ``PrintS3`` and return the result of its ``print_txt()`` call."""
    return PrintS3().print_txt()
I found a somewhat working solution.
I added setUp() and tearDown() methods to the test class to insert the path into sys.path before each test and remove it afterwards.
The path added to sys.path is the directory where main_0.py and s3.py are located.
import unittest
import os
import sys
class TestSomeMain(unittest.TestCase):
    """Runs ``lambda_handler`` with its source directory temporarily on sys.path."""

    def setUp(self) -> None:
        """Put the directory holding main_0.py and s3.py first on ``sys.path``."""
        # Resolve the path once and remember it, so tearDown removes exactly
        # the entry added here even if the working directory changes while
        # the test runs (re-resolving would then yield a different path and
        # sys.path.remove would raise ValueError).
        self._source_path = os.path.abspath("./src/main/lambda_function/some/")
        sys.path.insert(0, self._source_path)

    def tearDown(self) -> None:
        """Remove the ``sys.path`` entry added by ``setUp``."""
        sys.path.remove(self._source_path)

    def test_main_0(self):
        """``lambda_handler()`` must return ``"Hello"``."""
        from src.main.lambda_function.some.main_0 import lambda_handler

        res = lambda_handler()
        assert res == "Hello"
also update the test command in the terminal:
python -m pytest ./src/test/unittest/lambda_function/some --cov ./src/main/lambda_function/some --cov-report html
I have this structure:
├── app
│ ├── __init__.py
│ └── views.py
├── requirements.txt
├── sources
│ └── passport
│ ├── field_mapping.
│ ├── listener.py
│ ├── main.py
this is my init file:
from flask import Flask
app = Flask(__name__)
from app import views
my views file. Is this the best way to send plain text?
from app import app
from flask import Response
from sources.app_metrics import meters
# from sources.passport.main import subscription_types
@app.route('/metrics')
def metrics():
    """Return the per-subscription counts as a ``text/plain`` response."""

    def generateMetrics():
        """Return one ``thing_<subscription>_count <count>`` line per subscription."""
        return "".join(
            "thing_{}_count {}\n".format(subscription, meters[subscription].get()['count'])
            for subscription in ["something", "some other thing"]
        )

    # Build the payload once so the printed text and the response body are the
    # same snapshot of the meters.
    body = generateMetrics()
    print(body)
    return Response(body, mimetype='text/plain')
My sources/passport/main file looks like this:
subscription_types = ["opportunity", "account", "lead"]
if __name__ == "__main__":
loop = asyncio.get_event_loop()
...
for subscription in subscription_types():
I also ran export FLASK_ENV=app/__init__.py before running flask app
When I visit /metrics I get an error that looks like some kind of circular dependency error.
When I uncomment that import comment in my views, file, the error occurs.
Pulling out subscription_types into a variable and importing it seems to be causing the problem.
My stack trace:
File "/usr/local/lib/python3.7/site-packages/flask/cli.py", line 235, in locate_app
__import__(module_name)
File "/Users/jwan/extract/app/__init__.py", line 5, in <module>
from app import views
File "/Users/jwan//extract/app/views.py", line 5, in <module>
from sources.passport.main import subscription_types
File "/Users/jwan/extract/sources/passport/main.py", line 3, in <module>
from sources.passport.listener import subscribe, close_subscriptions
File "/Users/jwan/extract/sources/passport/listener.py", line 18, in <module>
QUEUE = boto3.resource("sqs").get_queue_by_name(QueueName=CONFIG["assertions_queue"][ENV])
botocore.errorfactory.QueueDoesNotExist: An error occurred (AWS.SimpleQueueService.NonExistentQueue) when calling the GetQueueUrl operation: The specified queue does not exist for this wsdl version.
My sources/passport/listener file has this on line 18:
import gzip
import log
from os import getenv
from sources.passport.normalizer import normalize_message
from sources.app_metrics import meters
QUEUE = boto3.resource("sqs").get_queue_by_name(QueueName=CONFIG["assertions_queue"][ENV])
I am working on writing some data tests. Super simple nothing crazy.
Here is what my current directory looks like.
.
├── README.md
├── hive_tests
│ ├── __pycache__
│ ├── schema_checks_hive.py
│ ├── test_schema_checks_hive.py
│ └── yaml
│ └── job_output.address_stats.yaml
└── postgres
├── __pycache__
├── schema_checks_pg.py
├── test_schema_checks_pg.py
└── yaml
When I cd in to postgres and run pytest all my tests pass.
When I cd in to hive_tests and run pytest I am getting an import error.
Here is my schema_checks_hive.py file.
from pyhive import hive
import pandas as pd
import numpy as np
import os, sys
import yaml
def check_column_name_hive(schema, table):
    """Run ``DESCRIBE <schema>.<table>`` and return the result frame's columns.

    The query is executed through ``conn``, a name this snippet does not
    define, and the columns are printed before being returned.
    """
    describe_sql = "DESCRIBE {0}.{1}".format(schema, table)
    columns = pd.read_sql_query(describe_sql, conn).columns
    print(columns)
    return columns
check_column_name_hive('myschema', 'mytable')
Here is my test_schema_checks_hive.py file where the tests are located.
import schema_checks_hive as sch
import pandas as pd
import yaml
import sys, os
def test_column_names_hive():
    """Compare each YAML spec's column list with what the Hive check returns.

    Every file in the ``yaml`` directory next to this test module is loaded.
    Its ``schema``, ``table`` and ``columns`` entries are read, and the value
    returned by ``sch.check_column_name_hive`` must match ``columns`` in both
    length and order.
    """
    # Anchor the fixture directory to this file rather than to the current
    # working directory, so the test finds its YAML files no matter where
    # pytest is started from.
    yaml_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yaml')
    for filename in os.listdir(yaml_dir):
        with open(os.path.join(yaml_dir, filename), 'r') as stream:
            data = yaml.safe_load(stream)
        schema = data['schema']
        table = data['table']
        cols = data['columns']
        df = sch.check_column_name_hive(schema, table)
        assert len(cols) == len(df)
        assert cols == df.tolist()
When I run Pytest I get an error that says:
ImportError while importing test module '/Usersdata/
tests/hive_tests/test_schema_checks_hive.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
test_schema_checks_hive.py:1: in <module>
import schema_checks_hive as sch
schema_checks_hive.py:1: in <module>
from pyhive import hive
E ModuleNotFoundError: No module named 'pyhive'
I would love any help! Thanks so much.
this is my project structure (just an example to illustrate the problem):
.
├── hello_world
│ ├── __init__.py
│ └── some
│ └── very_nested
│ └── stuff.py
└── tests
└── test_stuff.py
The test_stuff.py file (for py.test):
from hello_world.some.very_nested.stuff import Magic
from hello_world.some.very_nested.other_stuff import MoreMagic
def test_magic_fact_works():
assert Magic().fact(3) == 6
# ...
Is there any way how to make the import lines shorter? They get too long in the real project.
For example, this would be nice, but it doesn't work :)
import hello_world.some.very_nested as vn
from vn.stuff import Magic
from vn.other_stuff import MoreMagic
I cannot use relative imports (I assume) because the tests are not inside the package. I could move them, but is it possible without changing the project structure?
As @jonrsharpe says, you can aggregate your packages in a Django style:
"""
Django validation and HTML form handling.
"""
from django.core.exceptions import ValidationError # NOQA
from django.forms.boundfield import * # NOQA
from django.forms.fields import * # NOQA
from django.forms.forms import * # NOQA
from django.forms.formsets import * # NOQA
from django.forms.models import * # NOQA
from django.forms.widgets import * # NOQA
And in your subpackage, eg.: django.forms.widgets add this:
__all__ = (
'Media', 'MediaDefiningClass', 'Widget', 'TextInput', 'NumberInput',
'EmailInput', 'URLInput', 'PasswordInput', 'HiddenInput',
'MultipleHiddenInput', 'FileInput', 'ClearableFileInput', 'Textarea',
'DateInput', 'DateTimeInput', 'TimeInput', 'CheckboxInput', 'Select',
'NullBooleanSelect', 'SelectMultiple', 'RadioSelect',
'CheckboxSelectMultiple', 'MultiWidget', 'SplitDateTimeWidget',
'SplitHiddenDateTimeWidget', 'SelectDateWidget',
)
To specify which items you want to import when using import *, this way you can organize your packages as deep as you want and keep them accessible at the same time.
In your case it would be something like:
hello_world/__init__.py
from hello_world.some.very_nested.stuff import *
from hello_world.some.very_nested.other_stuff import *
So when importing your packages in tests, for instance, you get this: from hello_world import Magic
Put into hello_world/__init__.py:
from __future__ import absolute_import
from .some import *
Into hello_world/some/__init__.py:
from __future__ import absolute_import
from .very_nested import *
Into hello_world/some/very_nested/__init__.py:
from __future__ import absolute_import
from .stuff import Magic
from .other_stuff import MoreMagic
So if hello_world/some/very_nested/stuff.py contains:
class Magic:
    """Empty placeholder class."""
And hello_world/some/very_nested/other_stuff.py contains:
class MoreMagic:
    """Empty placeholder class, named to match ``from .other_stuff import MoreMagic``."""


# Old spelling kept as an alias so code that used ``OtherMagic`` still works.
OtherMagic = MoreMagic
Then you can easily import them. tests/test_stuff.py:
import pytest
from hello_world import Magic
from hello_world import MoreMagic
# Without the decorator, pytest would treat ``cls`` as a fixture name and
# fail with "fixture 'cls' not found".
@pytest.mark.parametrize('cls', [Magic, MoreMagic])
def test_magic(cls):
    """Each parametrized class can be instantiated and the instance is truthy."""
    assert cls()
I have an example of a scrapy project. it is pretty much default. its folder structure:
craiglist_sample/
├── craiglist_sample
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── test.py
└── scrapy.cfg
When you write scrapy crawl craigs -o items.csv -t csv to windows command prompt it writes craiglist items and links to console.
I want to create an example.py in main folder and print these items to python console inside it.
I tried
from scrapy import cmdline
cmdline.execute("scrapy crawl craigs".split())
but it writes the same as windows shell output. How can I make it print only items and list?
test.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craiglist_sample.items import CraiglistSampleItem
class MySpider(CrawlSpider):
    """Crawl the craigslist search listing and collect title/link items."""

    name = "craigs"
    ## allowed_domains = ["sfbay.craigslist.org"]
    ## start_urls = ["http://sfbay.craigslist.org/npo/"]
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.tr.craigslist.org/search/npo?"]
    ##search\/npo\?s=

    # Follow links found inside the "button next" anchor whose URL matches
    # s=<digit>00, and hand every such page to parse_items.
    rules = (
        Rule(
            SgmlLinkExtractor(
                # Raw string so \d reaches the regex engine unchanged.
                allow=(r's=\d00',),
                restrict_xpaths=('//a[@class="button next"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Return a list with one ``CraiglistSampleItem`` per ``span.pl`` node.

        Each item's ``title`` and ``link`` hold the extracted text and href of
        the anchor(s) directly under that span.
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//span[@class="pl"]')
        ## rows = hxs.select("//p[@class='row']")
        items = []
        # A separate loop name avoids rebinding the list being iterated.
        for row in rows:
            item = CraiglistSampleItem()
            item["title"] = row.select("a/text()").extract()
            item["link"] = row.select("a/@href").extract()
            items.append(item)
        return items
An approach could be turning off the default shell output of scrapy and insert a print command inside your parse_items function.
1 - Turn off Scrapy's log output in the settings.py file
LOG_ENABLED = False
Documentation about logging levels in Scrapy here: http://doc.scrapy.org/en/latest/topics/logging.html
2 - Add a print command for the items you are interested in
for titles in titles:
item = CraiglistSampleItem()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
print item ["title"], item ["link"]
The shell output will be:
[u'EXECUTIVE ASSISTANT'] [u'/eby/npo/4848086929.html']
[u'Direct Support Professional'] [u'/eby/npo/4848043371.html']
[u'Vocational Counselor'] [u'/eby/npo/4848042572.html']
[u'Day Program Supervisor'] [u'/eby/npo/4848041846.html']
[u'Educational Specialist'] [u'/eby/npo/4848040348.html']
[u'ORGANIZE WITH GREENPEACE - Grassroots Nonprofit Job!']
[u'/eby/npo/4847984654.html']
EDIT Code for executing from a script
import os
os.system('scrapy crawl craigs > log.txt')
There are several other ways to execute command-line programs from within Python.
Check Executing command line programs from within python and Calling an external command in Python