I have a very simple page spider that crawls for words on a given page and stores the word counts in a SQLite database. Although the code exits with exit code 0, the database is never updated with any entries.
I don't know if I'm just snow-blind or if there's something inherently wrong with my code.
Here's the structure of the project and the code:
spider.py
input.txt
words.db
utilities (folder):
url_utilities.py
database_utilities.py
spider.py
import argparse
from utilities import url_utilities, database_utilities
def main(database: str, url_list_file: str):
big_word_list = []
urls = url_utilities.load_urls_from_file(url_list_file)
for url in urls:
print(f"Reading {url}")
page_content = url_utilities.load_page(url=url)
words = url_utilities.scrape_page(page_contents=page_content)
big_word_list.extend(words)
# database code
path = "C:\\Users\\baduker\\PycharmProjects\\page_spider\\words.db"
database_utilities.create_database(database_path=path)
database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-db", "--database", help="SQLite File Name")
parser.add_argument("-i", "--input", help="File with urls")
args = parser.parse_args()
database_file = args.database
input_file = args.input
main(database=database_file, url_list_file=input_file)
url_utilities.py
import re
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup
def load_urls_from_file(file_path: str):
try:
with open("input.txt") as f:
content = f.readlines()
return content
except FileNotFoundError:
print(f"The file {file_path} could not be found.")
exit(2)
def load_page(url: str):
response = urlopen(url)
html = response.read().decode("utf-8")
return html
def scrape_page(page_contents: str):
chicken_noodle = BeautifulSoup(page_contents, "html.parser")
for script in chicken_noodle(["script", "style"]):
script.extract()
text = chicken_noodle.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = ' '.join(chunk for chunk in chunks if chunk)
plain_text = ''.join(filter(lambda x: x in string.printable, text))
clean_words = []
words = plain_text.split(" ")
for word in words:
clean = True
for punctuation_marks in string.punctuation:
if punctuation_marks in word:
clean = False
if any(char.isdigit() for char in word):
clean = False
# at least two characters but no more than 10
if len(word) < 2 or len(word) > 10:
clean = False
if not re.match(r'^\w+$', word):
clean = False
if clean:
try:
clean_words.append(word.lower())
except UnicodeEncodeError:
print(".")
return clean_words
database_utilities.py
import sqlite3 as lite
def create_database(database_path: str):
conn = lite.connect(database_path)
with conn:
cur = conn.cursor()
cur.execute("drop table if exists words")
ddl = "create table words (word text not null primary key, usage_count int default 1 not null);"
cur.execute(ddl)
ddl = "create unique index words_word_uindex on words (word);"
cur.execute(ddl)
conn.close()
def save_words_to_database(database_path: str, words_list: list):
conn = lite.connect(database_path)
with conn:
cur = conn.cursor()
for word in words_list:
sql = "select count(word) from words where word='" + word + "';"
cur.execute(sql)
count = cur.fetchone()[0]
if count > 0:
sql = "update words set usage_count = usage_count + 1 where word='" + word + "';"
else:
sql = "insert into words(word) values ('" + word + "');"
cur.execute(sql)
conn.commit()
conn.close()
print(f"Database save complete!")
input.txt
https://en.wikipedia.org/wiki/Python_(programming_language)
https://en.wikipedia.org/wiki/Guido_van_Rossum
https://en.wikipedia.org/wiki/Benevolent_dictator_for_life
Your code seems to be working.
I suspect you have a permissions issue with the database file.
Make sure this line points to a folder where you have permission to write:
path = "C:\\Users\\baduker\\PycharmProjects\\page_spider\\words.db"
Or just drop the absolute path and see if it works:
path = "words.db"
Regarding your context manager, i.e.
with conn:
you should commit before closing the connection. That is, call commit() inside that with block itself.
You should do that in your database utility file.
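As a rough sketch of what that could look like in database_utilities.py (this also swaps the string-built SQL for parameterized queries, purely as a tidier way to pass the word in; it is not required for the point about committing):

import sqlite3 as lite  # as in the question

def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        for word in words_list:
            cur.execute("select count(word) from words where word = ?;", (word,))
            count = cur.fetchone()[0]
            if count > 0:
                cur.execute("update words set usage_count = usage_count + 1 where word = ?;", (word,))
            else:
                cur.execute("insert into words(word) values (?);", (word,))
        conn.commit()  # commit while still inside the with block
    conn.close()
    print("Database save complete!")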
I'm trying to create a personal wiki that, instead of fetching info from the Wikipedia API with the summary function, searches my own info that is related to a key in a (key-value) file that I create. What should I do to make it work?
def _(text):
try:
textarea.delete('1.0', END)
text = text.lower()
value = wikipedia.summary(text, sentences=15)
value = value + '\n\n'
for word in value:
textarea.insert(END, word)
except wikipedia.DisambiguationError as e:
s = random.choice(e.options)
s = s.lower()
p = wikipedia.summary(s)
p = p + '\n\n'
for word in p:
textarea.insert(END, word)
Thank you in advance!
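A minimal sketch of one way the local lookup could work, assuming the personal entries live in a JSON file (the name wiki_data.json and the helper local_summary are made up for illustration) and that textarea and END are the same Tkinter names used above:

import json

def local_summary(key, path="wiki_data.json"):
    # Load the personal key-value store and return the value for the key, if any.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return data.get(key.lower())

def _(text):
    textarea.delete('1.0', END)
    value = local_summary(text)
    if value is None:
        value = "No local entry found for: " + text
    textarea.insert(END, value + '\n\n')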
Here is the error:
[Running] python -u "/Users/thomasmciver/Desktop/chat/ai-chat.py"
File "/Users/thomasmciver/Desktop/chat/ai-chat.py", line 38
with open ('Users/thomasmciver/Desktop/chat/RC_2015-01.txt' buffering=1000) as f:
^
SyntaxError: invalid syntax
[Done] exited with code=1 in 0.051 seconds
Here is my code:
# This Python file uses the following encoding: utf-8
import sqlite3
import json
from datetime import datetime
timeframe = '2015-05'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
def format_data(data):
data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
return data
def find_parent(pid):
try:
sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
with open ('Users/thomasmciver/Desktop/chat/RC_2015-01.txt' buffering=1000) as f:
for row in f:
row_counter += 1
row = json.loads(row)
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['name']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
# maybe check for a child, if child, is our new score superior? If so, replace. If not...
if score >= 2:
existing_comment_score = find_existing_score(parent_id)
You mean
with open('Cloud Drive/Desktop/chat/RC_2015-01.txt', buffering=1000) as f:
instead of
with open ('Cloud Drive/Desktop/chat/RC_2015-01.txt' (buffering=1000)) as f:
What you had is not valid syntax: the filename and the buffering keyword argument must be separated by a comma.
My file path is
C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/
There are many folders in that directory. I need to look through those directories and open the files that start with 'RC_'.
Here's my code:
import sqlite3
import json
import os
from datetime import datetime
timeframe = '2015-05'
sql_transaction = []
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
def format_data(data):
data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
return data
def find_parent(pid):
try:
sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
with open('C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe), buffering=1000) as f:
for row in f:
row_counter += 1
row = json.loads(row)
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['name']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
# maybe check for a child, if child, is our new score superior? If so, replace. If not...
if score >= 2:
existing_comment_score = find_existing_score(parent_id)
But it seems there is some mistake in the path. I get an error
Traceback (most recent call last):
  File "C:/Users/Ratul/AppData/Local/Programs/Python/Python37/test02.py", line 36, in <module>
    with open('C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe), buffering=1000) as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/2015/RC_2015-05'
I'm not sure what I did wrong there. Please help.
Use How to debug small programs (#1) and
print('C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(
timeframe.split('-')[0],timeframe))
instead of open. Check whether every path you build actually exists, because for some of your values it does not. Hence the error.
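A quick sketch of that check, using the same path expression as in the question:

import os

myname = 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(
    timeframe.split('-')[0], timeframe)
print(myname)                  # shows exactly which path is being built
print(os.path.exists(myname))  # False means the FileNotFoundError is expected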
If most of your files exist, it is far easier to handle the error itself:
myname = 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe)
try:
with open(myname, buffering=1000) as f:
for row in f:
row_counter += 1
row = json.loads(row)
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['name']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
# maybe check for a child, if child, is our new score superior? If so, replace. If not...
if score >= 2:
existing_comment_score = find_existing_score(parent_id)
except FileNotFoundError as fnfError:
print(myname)
print(fnfError)
The open() call does not care whether you use \ or /. If you use \, you should escape it or use a raw string (e.g. r'C:\some\dir\file.txt'). Your syntax is OK as is: open() will use the appropriate directory delimiters under Windows even if you give it 'c:/somedir/file.txt'.
Read up: about error handling.
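Separately, since the stated goal is to look through the folders under reddit_data and open every file whose name starts with 'RC_', here is a minimal sketch of that using os.walk (the root path is taken from the question; adjust as needed):

import os

root = 'C:/Users/Ratul/Downloads/Machine_Learning_Data/reddit_data/reddit_data/'

# Collect the full paths of all files whose names start with 'RC_'.
rc_files = []
for dirpath, dirnames, filenames in os.walk(root):
    for name in filenames:
        if name.startswith('RC_'):
            rc_files.append(os.path.join(dirpath, name))

for path in rc_files:
    with open(path, buffering=1000) as f:
        for row in f:
            pass  # process each line here, e.g. json.loads(row)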
I hope the title wasn't too confusing, but you'll see what I meant by that in a bit. In the meantime, some backstory: I'm working on a function that generates random usernames and passwords and writes them to a text file as username:password, for another program that parses each username:password line as:
string = line.split(":")
username = string[0]
pwd = string[1]
Why does this matter? Well, when I run my function:
Code:
# To generate users and passwords for the password file:
"""
Usage: count-- how many accounts to generate
file-- where to dump the accounts
method-- dict is where it loops through words
and chooses random ones as users and passwords,
and brute (not implemented yet) is where it chooses
random characters and strings them together as users
and passwords.
users-- if you want any filled in users, put them in here.
passes-- if you want any filled in passes, put them in here.
"""
def genAccts(count, file, method="dict", users=[], passes=[]):
try:
f = open(file, "w")
if method == "dict":
dictionary = "Dictionary.txt"#input("[*] Dictionary file: ")
d = open(dictionary, "r")
words = d.readlines()
d.close()
accts = []
for b in range(0, count):
global user
global pwd
user = random.choice(words)
pwd = random.choice(words)
if b < len(users)-1:
user = users[b]
if b < len(passes)-1:
pwd = passes[b]
acct = [user, pwd]
accts.append(acct)
print("[+] Successfully generated",count,"accounts")
for acct in accts:
combined = acct[0]+":"+acct[1]
print(combined)
f.write(combined)
f.close()
print("[+] Successfully wrote",count,"accounts in",file+"!")
except Exception as error:
return str(error)
genAccts(50, "brute.txt")
In my password file brute.txt, I get an output like
quainter
:slightest
litany
:purples
reciprocal
:already
delicate
:four
and so I'm wondering: why is a \n added after the username?
You can fix this by replacing:
words = d.readlines()
with:
words = [x.strip() for x in d.readlines()]
words = d.readlines()
readlines() returns a list that contains each line as an item, so every word will have a \n character at the end. To get the required output, you have to trim the whitespace characters from the username.
user = random.choice(words).strip()
Above line will solve your issue!
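Another small sketch of the same idea: strip the newlines once, right when the dictionary file is read, so every later random.choice() already returns a clean word:

with open("Dictionary.txt", "r") as d:
    words = d.read().splitlines()  # splitlines() drops the trailing '\n' on every line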
Use this:
def genAccts(count, file, method="dict", users=[], passes=[]):
try:
f = open(file, "w")
if method == "dict":
dictionary = "Dictionary.txt"#input("[*] Dictionary file: ")
d = open(dictionary, "r")
words = [w.strip() for w in d.readlines()]  # strip each line; readlines() itself has no .strip()
d.close()
accts = []
for b in range(0, count):
global user
global pwd
user = random.choice(words)
pwd = random.choice(words)
if b < len(users)-1:
user = users[b]
if b < len(passes)-1:
pwd = passes[b]
acct = [user, pwd]
accts.append(acct)
print("[+] Successfully generated",count,"accounts")
for acct in accts:
combined = acct[0]+":"+acct[1]
print(combined)
f.write(combined + "\n")  # newline so each account stays on its own line
f.close()
print("[+] Successfully wrote",count,"accounts in",file+"!")
except Exception as error:
return str(error)
genAccts(50, "brute.txt")
I'm currently developing a web crawler that works through a list of URLs I have stored in a queue file. I need my spider to scrape all words from these URL pages before it moves on to the next link in the queue. I need a pointer in the right direction for setting it up so that the scraper compares each word against my common.txt, to make sure the word isn't in there and isn't already in the keyword list, before adding it, etc.
I had tried something like this with get_keywords in my Spider.py, but it isn't doing anything. I may be missing something simple, as I've been coding all day, but anyway, here is my code.
Spider.py
from Gen_info import *
class Spider:
project_name = ''
queue_file = ''
crawled_file = ''
keyword_file = ''
queue = set()
crawled = set()
def __init__(self, project_name):
Spider.project_name = project_name
Spider.queue_file = Spider.project_name + '/Chrome_Hist.csv'
Spider.crawled_file = Spider.project_name + '/CrawledUrls.txt'
self.boot()
#self.crawl_page('First spider', Spider.queue)
# Creates directory and files for project on first run and starts the spider
#staticmethod
def boot():
create_project_dir(Spider.project_name)
create_files(Spider.project_name)
Spider.queue = file_to_set(Spider.queue_file)
Spider.crawled = file_to_set(Spider.crawled_file)
# Updates user display, fills queue and updates files
#staticmethod
def crawl_page(thread_name, page_url):
if page_url not in Spider.crawled:
print(thread_name + ' now crawling ' + page_url)
print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
Spider.queue.remove(page_url)
Spider.crawled.add(page_url)
Spider.update_files()
#staticmethod
def update_files():
set_to_file(Spider.queue, Spider.queue_file)
set_to_file(Spider.crawled, Spider.crawled_file)
#staticmethod
def get_keywords(Page_words):
common = open("Common_words.txt").read().split('\n')
word_dict = {}
word_list = Page_words.lower().split()
for word in word_list:
if word not in common and word.isalnum():
if word not in word_dict:
word_dict[word] = 1
if word in word_dict:
word_dict[word] += 1
main.py
import threading
from Queue import Queue
from Spider import Spider
from Gen_info import *
import urllib2
from bs4 import BeautifulSoup
from shutil import copyfile
import os
PROJECT_NAME = 'History Forensics'
QUEUE_FILE = PROJECT_NAME + '/Chrome_Hist.csv'
CRAWLED_FILE = PROJECT_NAME + '/CrawledUrls.txt'
NUMBER_OF_THREADS = 2
Queue = Queue()
Spider(PROJECT_NAME)
keywords = ''
src = 'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'
dst = PROJECT_NAME
path = 'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'
# Create worker threads (will die when main exits)
def create_workers():
for _ in range(NUMBER_OF_THREADS):
t = threading.Thread(target=work)
t.daemon = True
t.start()
# Do the next job in the queue
def work():
while True:
url = Queue.get()
Spider.crawl_page(threading.current_thread().name, url)
Queue.task_done()
# Each queued link is a new job
def create_jobs():
for link in file_to_set(QUEUE_FILE):
Queue.put(link)
Queue.join()
crawl()
# Check if there are items in the queue, if so crawl them
def crawl():
queued_links = file_to_set(QUEUE_FILE)
if len(queued_links) > 0:
print(str(len(queued_links)) + ' links in the queue')
create_jobs()
def get_keywords():
common_words = open('File_Storage/common.txt', 'r').readlines()
keywords=open(PROJECT_NAME + '/keywords.txt', 'r').read().split('\n')
f = open(PROJECT_NAME + '/keywords.txt', 'a')
urls = file_to_set(QUEUE_FILE)
Hist_queue = urls
for i in Hist_queue:
html_content = urllib2.urlopen(i).read()
soup = BeautifulSoup(html_content)
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = '\n'.join(chunk for chunk in chunks if chunk)
(text.encode('utf-8'))
visible_text = soup.getText()
words = visible_text.split(' ')
for word in words:
if word not in common_words and word not in keywords and word.isalnum():
f.write(word + '\n')
keywords.append(word)
else:
continue
#copyfile(src, dst)
#
# os.remove(path)
create_workers()
get_keywords()
crawl()
If you have any questions about how it works, fire away, or ask about any other code you may need to see.
Thanks in advance, everyone.
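One thing that stands out, offered as a hedged observation rather than a definitive fix: readlines() keeps the trailing '\n' on every entry of common_words, and read().split('\n') can leave empty strings in keywords, so the membership tests may never match. Here is a minimal sketch of the filtering step with both lists normalised into sets (file names and helpers as in the question):

def get_keywords():
    # Normalise both word lists: strip newlines and use sets for fast membership tests.
    with open('File_Storage/common.txt', 'r') as c:
        common_words = set(line.strip().lower() for line in c if line.strip())
    with open(PROJECT_NAME + '/keywords.txt', 'r') as k:
        keywords = set(line.strip().lower() for line in k if line.strip())

    with open(PROJECT_NAME + '/keywords.txt', 'a') as f:
        for url in file_to_set(QUEUE_FILE):
            html_content = urllib2.urlopen(url).read()
            soup = BeautifulSoup(html_content, 'html.parser')
            for script in soup(["script", "style"]):
                script.extract()
            for word in soup.get_text().lower().split():
                if word.isalnum() and word not in common_words and word not in keywords:
                    f.write(word + '\n')
                    keywords.add(word)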