I'm currently developing a web crawler that works through a list of URLs I have stored in a queue file. I need my spider to scrape all words from each of these URL pages before it moves on to the next link in the queue. I'm looking for a pointer in the right direction for setting it up so that the scraper compares each word against my common.txt (to make sure the word isn't in there) and checks that it isn't already in the keyword list before adding it.
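Roughly, the behaviour I'm after looks something like this (just a sketch of the idea, not working code from my project; it assumes common.txt holds one lowercase word per line):

common = set(open('common.txt').read().split())

def keep_new_words(page_words, keywords):
    # keep a word only if it is alphanumeric, not a common word,
    # and not already in the keyword list
    for word in page_words:
        word = word.lower()
        if word.isalnum() and word not in common and word not in keywords:
            keywords.append(word)
    return keywords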
I had tried something like this with get_keywords in my spider.py, but it isn't doing anything. I may be missing something simple as I've been coding all day, but anyway, here is my code.
Spider.py
from Gen_info import *
class Spider:
    project_name = ''
    queue_file = ''
    crawled_file = ''
    keyword_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name):
        Spider.project_name = project_name
        Spider.queue_file = Spider.project_name + '/Chrome_Hist.csv'
        Spider.crawled_file = Spider.project_name + '/CrawledUrls.txt'
        self.boot()
        #self.crawl_page('First spider', Spider.queue)

    # Creates directory and files for project on first run and starts the spider
    @staticmethod
    def boot():
        create_project_dir(Spider.project_name)
        create_files(Spider.project_name)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    # Updates user display, fills queue and updates files
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)

    @staticmethod
    def get_keywords(Page_words):
        common = open("Common_words.txt").read().split('\n')
        word_dict = {}
        word_list = Page_words.lower().split()
        for word in word_list:
            if word not in common and word.isalnum():
                if word not in word_dict:
                    word_dict[word] = 1
                if word in word_dict:
                    word_dict[word] += 1
main.py
import threading
from Queue import Queue
from Spider import Spider
from Gen_info import *
import urllib2
from bs4 import BeautifulSoup
from shutil import copyfile
import os

PROJECT_NAME = 'History Forensics'
QUEUE_FILE = PROJECT_NAME + '/Chrome_Hist.csv'
CRAWLED_FILE = PROJECT_NAME + '/CrawledUrls.txt'
NUMBER_OF_THREADS = 2
Queue = Queue()
Spider(PROJECT_NAME)
keywords = ''
src = 'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'
dst = PROJECT_NAME
path = 'C:\Users\Lewis Collins\Python Project\ChromeDBs\Chrome_Hist.csv'

# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

# Do the next job in the queue
def work():
    while True:
        url = Queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        Queue.task_done()

# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        Queue.put(link)
    Queue.join()
    crawl()

# Check if there are items in the queue, if so crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()

def get_keywords():
    common_words = open('File_Storage/common.txt', 'r').readlines()
    keywords = open(PROJECT_NAME + '/keywords.txt', 'r').read().split('\n')
    f = open(PROJECT_NAME + '/keywords.txt', 'a')
    urls = file_to_set(QUEUE_FILE)
    Hist_queue = urls
    for i in Hist_queue:
        html_content = urllib2.urlopen(i).read()
        soup = BeautifulSoup(html_content)
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        (text.encode('utf-8'))
        visible_text = soup.getText()
        words = visible_text.split(' ')
        for word in words:
            if word not in common_words and word not in keywords and word.isalnum():
                f.write(word + '\n')
                keywords.append(word)
            else:
                continue

#copyfile(src, dst)
#
# os.remove(path)

create_workers()
get_keywords()
crawl()
Any questions about how it works, fire away, or ask for any other code you may need to see.
Thanks in advance, everyone.
I'm working on my tool, and I have this function:
import subprocess, os, platform, ctypes, requests, random, threading
from bs4 import BeautifulSoup as bs
from requests.utils import requote_uri   # needed for requote_uri() below
from urllib.parse import unquote         # needed for unquote() below

temptotal = 0
totalurl = 0
retry = 0
load = 0
load2 = 0
loaded = 0
dorksdone = 0
tempourl = 0

# Import proxy list
selecting = 1
while selecting == 1:
    try:
        option = int(input("Choose Type Proxy(1 = http, 2=socks4, 3 = socks5) :"))
    except:
        option = 404
    if option == 1:
        selecting = 0
        prox = 'http'
        proxyyyy = 'http'
    elif option == 2:
        selecting = 0
        prox = 'socks4'
        proxyyyy = 'socks4'
    elif option == 3:
        selecting = 0
        prox = 'socks5'
        proxyyyy = 'socks5'
    else:
        print("Choose a valid number such as 1, 2 or 3!")

proxy_list = input("Give me Proxylist :")
with open(proxy_list, mode="r", encoding="utf-8") as mf:
    for line in mf:
        load2 += 1
print(" ")
print("Total Proxy loaded :" + str(load2))
print(" ")

# Import keyword file
dorkslist = input("Give me KeywordList/Dorklist :" + bcolors.ENDC + " ")
with open(dorkslist, mode="r", encoding="utf-8") as mf:
    for line in mf:
        load += 1
    mf.close()
print(" ")
print("Total Dorks loaded:" + str(load))
print(" ")

# Define URLs to check
yahoourl = {"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb",
            "https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb&b=11&pz=10"}

# Function I want to speed up
def checker():
    global temptotal
    global loaded
    global dorksdone
    global tempourl
    proxy = set()
    with open(proxy_list, "r") as f:
        file_lines1 = f.readlines()
        for line1 in file_lines1:
            proxy.add(line1.strip())
    with open(dorkslist, mode="r", encoding="utf-8") as my_file:
        for line in my_file:
            loaded += 1
            threading.Thread(target=titre).start()
            indorks = line
            encode = requote_uri(indorks)
            for yahoo in yahoourl:
                yahooo = yahoo.replace("&fr", encode + "&fr")
                try:
                    proxies = {
                        'http': prox + '://' + random.choice(list(proxy))
                    }
                    r = requests.get(yahooo, proxies=proxies)
                    print("Dorks used :" + indorks)
                    dorksdone += 1
                    soup = bs(r.text, 'html.parser')
                    links = soup.find_all('a')
                    for link in soup.find_all('a'):
                        a = link.get('href')
                        unquote(a)
                        temptotal += 1
                        with open("Bing.txt", mode="a", encoding="utf-8") as fullz:
                            fullz.write(a + "\n")
                            fullz.close()
                    lines_seen = set()  # holds lines already seen
                    outfile = open("Bingnodup.txt", "w", encoding="utf-8")
                    for line in open("Bing.txt", "r", encoding="utf-8"):
                        if line not in lines_seen:  # not a duplicate
                            outfile.write(line)
                            lines_seen.add(line)
                    outfile.close()
                    with open("Bingnodup.txt", mode="r", encoding="utf-8") as cool:
                        for url in cool:
                            try:
                                proxies = {
                                    'http': prox + '://' + random.choice(list(proxy))
                                }
                                response = requests.get(url, proxies=proxies)
                                save = response.url
                                with open("Bingtemp.txt", mode="a", encoding="utf-8") as cool1:
                                    cool1.write(save + "\n")
                                    tempourl += 1
                                    cool1.close()
                            except:
                                pass
                except:
                    raise
    fin()

# Start bot
bot1 = threading.Thread(target=checker)
bot1.start()
bot1.join()
Example file for keywords:
python
wordpress
Example file for proxies (HTTP, so choose option 1):
46.4.96.137:8080
223.71.167.169:80
219.248.205.117:3128
198.24.171.34:8001
51.158.123.35:9999
But this function is very, very slow when running. Could someone let me know how I can speed it up?
I have tried to use this topic: How can I use threading in Python?
But I didn't understand how to build it into my function the right way.
Your script is what's called I/O bound. What this means is that it is not slow because the CPU needs to perform long computations, but because it needs to wait a lot every time it requests a URL (the bottleneck is the requests to the internet).
For concurrency you have 3 options:
asyncio
threading
multiprocessing
The first two are the ones which can help you in I/O bound problems like yours. The first one is the recommended approach in a problem like this, since there is a library available with support for async/await.
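If you do want to stay with threads (option 2), here is a minimal sketch using concurrent.futures; fetch_one is a hypothetical stand-in for a single request from your checker loop, not code from your script:

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_one(url, proxies):
    # one blocking request; errors are returned instead of raised
    try:
        return requests.get(url, proxies=proxies, timeout=10).url
    except requests.RequestException as exc:
        return exc

def fetch_all(urls, proxies, workers=20):
    # run up to `workers` requests at the same time
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(lambda u: fetch_one(u, proxies), urls))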
This is an adapted example from the above link, which does exactly what you need:
import asyncio
import time
import os
import platform
import random

import aiohttp

def get_proxies():
    if platform.system() == "Linux":
        clear = lambda: os.system('clear')
        clear()
    if platform.system() == "Windows":
        clear = lambda: os.system('cls')
        clear()
    proxy = set()
    with open("proxy.txt", "r") as f:
        file_lines1 = f.readlines()
        for line1 in file_lines1:
            proxy.add(line1.strip())
    return proxy

async def download_site(session, url, proxies):
    # aiohttp takes a single proxy URL via proxy=, not a requests-style dict
    async with session.get(url, proxy=proxies['http']) as response:
        save = str(response.url)
        with open("Yahootemp.txt", mode="a", encoding="utf-8") as cool1:
            cool1.write(save + "\n")

async def download_all_sites(sites, proxies):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in sites:
            task = asyncio.ensure_future(download_site(session, url, proxies))
            tasks.append(task)
        await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == "__main__":
    proxies = get_proxies()
    proxies = {
        'http': prox + '://' + random.choice(list(proxies))  # prox comes from your script
    }
    sites = []
    with open("Yahoonodup.txt", mode="r", encoding="utf-8") as cool:
        for url in cool:
            sites.append(url.strip())
    asyncio.get_event_loop().run_until_complete(download_all_sites(sites, proxies))
You could make it even faster if saving the files still seems to be too slow; read this.
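If the file writes turn out to be the bottleneck, one option (just a sketch, assuming the aiofiles package is acceptable) is to make the write non-blocking as well:

import aiofiles

async def save_result(path, line):
    # append one line without blocking the event loop
    async with aiofiles.open(path, mode="a", encoding="utf-8") as f:
        await f.write(line + "\n")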
I have written the following to automate parsing text from a .txt file that is continuously being updated and posting it to a Telegram bot.
import urllib.parse
import time
import requests

def post_to_telegram(msg):
    print(msg)
    base_url = 'https://api.telegram.org/bot&text="{}"'.format(msg)
    requests.get(base_url)

urr = ""
name = ""
price = ""
ourLines = 0

while(True):
    file1 = open('example.txt', 'r')
    Lines = file1.readlines()
    time.sleep(1)
    while(True):
        if(ourLines == len(Lines)):
            break
        else:
            txt = Lines[ourLines].strip()
            tlist = txt.split("&")
            ourLines = ourLines + 1
            for subtxt in tlist:
                if "eventurl=" in subtxt:
                    a = subtxt[9:len(subtxt) - 3]
                    url = 'www.bbey43.com/#' + a.replace("%23", "/")
                    #print(url)
                    urr = url
                elif "bet=" in subtxt:
                    name = urllib.parse.unquote(subtxt[4:len(subtxt)])
                    #print(name)
                elif "price\":" in subtxt:
                    a = subtxt.split("price")[1]
                    price = a.split("\"")[2]
                    #print(price)
            post_to_telegram(urr + " " + name + " " + price)
The 'name' and 'price' are successfully posted to the bot, but the 'url' doesn't post correctly. The only thing that gets through is "bbey43.com/#/".
The solution to this was rather simple in the end. As the "#" was part of a URL, it required special formatting when being parsed.
Simply using %23 instead of # solved it.
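For illustration (a sketch, not the exact code from the question), the same effect can be had by percent-encoding the fragment character before building the message:

import urllib.parse

url = 'www.bbey43.com/#/some/path'
# encode '#' as %23 so the Telegram API does not cut the message at the fragment
safe_url = urllib.parse.quote(url, safe='/:.')
print(safe_url)  # www.bbey43.com/%23/some/path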
I am running a Python scraper that scrapes quotes from a webpage and outputs the result into a CSV file.
I have not written this myself because I am a beginner, but as I was running this code to test it out and use parts of it myself, I got this error. I know what the error means, but I am pretty clueless about how to approach fixing it. I would like to push an update to the author's GitHub to help.
Traceback (most recent call last):
File "quotes.py", line 100, in <module>
get_authors()
File "quotes.py", line 58, in get_authors
quote_details = fetch_quote(url)
File "quotes.py", line 77, in fetch_quote
tempString += ("\"%s\","%next(q.find_class('b-qt')[0].iter('a')).text)
IndexError: list index out of range
The problem happens when it starts to fetch the quotes. Creating a list of authors and a list of the URLs works without any issues. The IndexError happens after it creates the CSV file; that's the moment the error gets thrown, so I assume the problem is with this part of the code:
tempString += ("\"%s\","%next(q.find_class('b-qt')[0].iter('a')).text)
Does this sound about right? I have absolutely no clue how to solve errors in Python beyond TypeErrors and some of the simpler IndexErrors. I would love to learn, but all my searching on Stack Overflow showed a lot of people with the same issue when it comes to CSV files, and all the answers were very specific.
#!/usr/bin/python
import requests
from lxml import html
import time
import string

def get_authors():
    baseUrl = 'http://www.brainyquote.com'
    urlString = 'http://www.brainyquote.com/authors/'
    authorsUrl = [urlString + x for x in list(string.lowercase[:26])]
    urlsList = []  # authors list page urls

    print ""
    print "Scanning Started for page links"
    print ""
    for url in authorsUrl:
        print "Scanning URL: %s" % url
        urlsList.append(url)
        urlsList.extend(pagination(url, False))

    authorsList = []
    print ""
    print "Scanning Started for Author Pages"
    print ""
    for url in urlsList:
        print "Scanning URL: %s" % url
        authorsList.extend(get_authors_links(url))

    # Write all authors links
    authorsFile = open("authors.txt", "a+")
    for urls in authorsList:
        authorsFile.write(baseUrl + urls.encode('utf-8') + "\n")
    authorsFile.close()

    quoteLinks = []
    # Write all authors links
    print ""
    print "Scanning Started for Quote Page Links"
    print ""
    for url in authorsList:
        newUrl = (baseUrl + url)
        print "Scanning URL: %s" % newUrl
        quoteLinks.append(newUrl)
        arr = pagination(newUrl, True)
        quoteLinks.extend(arr)

    # Write all quotes link
    linksFile = open("quotes_links.txt", "a+")
    for url in quoteLinks:
        linksFile.write(url.encode('utf-8') + "\n")
    linksFile.close()

    print ""
    print "Scanning Started for fetching quotes"
    print ""
    # Write all quotes
    quotesFile = open("quotes.csv", "a+")
    for url in quoteLinks:
        quote_details = fetch_quote(url)
        quotesFile.write(quote_details.encode('utf-8') + "\n")

    print ""
    print "All Done \nThanks for using it...!!!"
    print ""

def get_authors_links(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    arr = tree.xpath('//table[@class="table table-hover table-bordered"]//td/a/@href')
    return arr

def fetch_quote(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    quotes = tree.find_class('bqQt')
    tempString = ""
    for q in quotes:
        tempString += ("\"%s\"," % next(q.find_class('b-qt')[0].iter('a')).text)
        tempString += ("%s," % next(q.find_class('bq-aut')[0].iter('a')).text)
        for element in q.find_class('oncl_k'):
            tempString += "%s " % element.text
        tempString += "\n"
    return tempString

def pagination(url, htmlPage):  # .html or not - htmlPage True or False
    arr = []
    page = requests.get(url)
    tree = html.fromstring(page.text)
    end = tree.xpath('//div[@class="row paginationContainer"]//nav//ul/li[last()-1]/a/text()')
    if len(end):
        if (htmlPage):
            url = url.split('.html')[0]
            for count in range(2, int(end[0]) + 1):
                arr.append(url + "%s.html" % (count))
        else:
            for count in range(2, int(end[0]) + 1):
                arr.append(url + "%s" % (count))
    return arr

if __name__ == '__main__':
    get_authors()
Any ideas or pointers would be much appreciated. From what I know this should not be hard to fix, but as a beginner the idea of changing three lines in code longer than I am used to is very daunting.
All credit to the author, I hope I can push a fix with your help:
https://github.com/ravingupta/brainyquote/
This works (code converted to Python 3):
import requests
from lxml import html
import string

def get_authors():
    baseUrl = 'http://www.brainyquote.com'
    urlString = 'http://www.brainyquote.com/authors/'
    authorsUrl = [urlString + x for x in list(string.ascii_lowercase[:26])]
    urlsList = []  # authors list page urls

    print("")
    print("Scanning Started for page links")
    print("")
    for url in authorsUrl:
        print("Scanning URL: %s" % url)
        urlsList.append(url)
        urlsList.extend(pagination(url, False))

    authorsList = []
    print("")
    print("Scanning Started for Author Pages")
    print("")
    for url in urlsList:
        print("Scanning URL: %s" % url)
        authorsList.extend(get_authors_links(url))

    # Write all authors links
    authorsFile = open("authors.txt", "a+")
    for urls in authorsList:
        authorsFile.write(baseUrl + str(urls.encode('utf-8')) + "\n")
    authorsFile.close()

    quoteLinks = []
    # Write all authors links
    print("")
    print("Scanning Started for Quote Page Links")
    print("")
    for url in authorsList:
        newUrl = (baseUrl + url)
        print("Scanning URL: %s" % newUrl)
        quoteLinks.append(newUrl)
        arr = pagination(newUrl, True)
        quoteLinks.extend(arr)

    # Write all quotes link
    linksFile = open("quotes_links.txt", "a+")
    for url in quoteLinks:
        linksFile.write(str(url.encode('utf-8')) + "\n")
    linksFile.close()

    print("")
    print("Scanning Started for fetching quotes")
    print("")
    # Write all quotes
    quotesFile = open("quotes.csv", "a+")
    for url in quoteLinks:
        quote_details = fetch_quote(url)
        quotesFile.write(str(quote_details.encode('utf-8')) + "\n")

    print("")
    print("All Done \nThanks for using it...!!!")
    print("")

def get_authors_links(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    arr = tree.xpath('//table[@class="table table-hover table-bordered"]//td/a/@href')
    return arr

def fetch_quote(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    quotes = tree.find_class('bqQt')
    tempString = ""
    for q in quotes:
        tempString += ("\"%s\"," % next(q.find_class('b-qt')[0].iter('a')).text)
        tempString += ("%s," % next(q.find_class('bq-aut')[0].iter('a')).text)
        for element in q.find_class('oncl_k'):
            tempString += "%s " % element.text
        tempString += "\n"
    return tempString

def pagination(url, htmlPage):  # .html or not - htmlPage True or False
    arr = []
    page = requests.get(url)
    tree = html.fromstring(page.text)
    end = tree.xpath('//div[@class="row paginationContainer"]//nav//ul/li[last()-1]/a/text()')
    if len(end):
        if (htmlPage):
            url = url.split('.html')[0]
            for count in range(2, int(end[0]) + 1):
                arr.append(url + "%s.html" % (count))
        else:
            for count in range(2, int(end[0]) + 1):
                arr.append(url + "%s" % (count))
    return arr

if __name__ == '__main__':
    get_authors()
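If the IndexError still shows up on some pages (when a quote block has no 'b-qt' or 'bq-aut' child), a defensive variant of fetch_quote could simply skip those blocks. This is only a sketch of one way to guard the lookups, meant as a drop-in replacement inside the same script, not part of the original repository:

def fetch_quote(url):
    page = requests.get(url)
    tree = html.fromstring(page.text)
    tempString = ""
    for q in tree.find_class('bqQt'):
        quote_nodes = q.find_class('b-qt')
        author_nodes = q.find_class('bq-aut')
        # skip blocks that do not contain both a quote and an author link
        if not quote_nodes or not author_nodes:
            continue
        tempString += "\"%s\"," % next(quote_nodes[0].iter('a')).text
        tempString += "%s," % next(author_nodes[0].iter('a')).text
        for element in q.find_class('oncl_k'):
            tempString += "%s " % element.text
        tempString += "\n"
    return tempString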
I have a very simple page spider that crawls for words on a given page and stores the word counts in a SQLite database. Although the code exits with exit code 0, the database doesn't get updated with any entries.
I don't know if I'm just snow blind or there's something inherently wrong with my code.
Here's the structure of the project and the code:
spider.py
input.txt
words.db
utilities (folder):
url_utilities.py
database_utilities.py
spider.py
import argparse
from utilities import url_utilities, database_utilities

def main(database: str, url_list_file: str):
    big_word_list = []
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print(f"Reading {url}")
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    path = "C:\\Users\\baduker\\PycharmProjects\\page_spider\\words.db"
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path, words_list=big_word_list)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-db", "--database", help="SQLite File Name")
    parser.add_argument("-i", "--input", help="File with urls")
    args = parser.parse_args()
    database_file = args.database
    input_file = args.input
    main(database=database_file, url_list_file=input_file)
url_utilities.py
import re
import string
from urllib.request import urlopen
from bs4 import BeautifulSoup

def load_urls_from_file(file_path: str):
    try:
        with open("input.txt") as f:
            content = f.readlines()
            return content
    except FileNotFoundError:
        print(f"The file {file_path} could not be found.")
        exit(2)

def load_page(url: str):
    response = urlopen(url)
    html = response.read().decode("utf-8")
    return html

def scrape_page(page_contents: str):
    chicken_noodle = BeautifulSoup(page_contents, "html.parser")
    for script in chicken_noodle(["script", "style"]):
        script.extract()
    text = chicken_noodle.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = ' '.join(chunk for chunk in chunks if chunk)
    plain_text = ''.join(filter(lambda x: x in string.printable, text))
    clean_words = []
    words = plain_text.split(" ")
    for word in words:
        clean = True
        for punctuation_marks in string.punctuation:
            if punctuation_marks in word:
                clean = False
        if any(char.isdigit() for char in word):
            clean = False
        # at least two characters but no more than 10
        if len(word) < 2 or len(word) > 10:
            clean = False
        if not re.match(r'^\w+$', word):
            clean = False
        if clean:
            try:
                clean_words.append(word.lower())
            except UnicodeEncodeError:
                print(".")
    return clean_words
database_utilities.py
import sqlite3 as lite

def create_database(database_path: str):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        cur.execute("drop table if exists words")
        ddl = "create table words (word text not null primary key, usage_count int default 1 not null);"
        cur.execute(ddl)
        ddl = "create unique index words_word_uindex on words (word);"
        cur.execute(ddl)
    conn.close()

def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    with conn:
        cur = conn.cursor()
        for word in words_list:
            sql = "select count(word) from words where word='" + word + "';"
            cur.execute(sql)
            count = cur.fetchone()[0]
            if count > 0:
                sql = "update words set usage_count = usage_count + 1 where word='" + word + "';"
            else:
                sql = "insert into words(word) values ('" + word + "');"
            cur.execute(sql)
    conn.commit()
    conn.close()
    print(f"Database save complete!")
input.txt
https://en.wikipedia.org/wiki/Python_(programming_language)
https://en.wikipedia.org/wiki/Guido_van_Rossum
https://en.wikipedia.org/wiki/Benevolent_dictator_for_life
Your code seems to be working.
I suspect you have a permissions issue with the database file.
Make sure this line points to a folder where you have permissions to write:
path = "C:\\Users\\baduker\\PycharmProjects\\page_spider\\words.db"
or just remove the path and see if it works.
path = "words.db"
Regarding your context manager, i.e.
with conn:
you should commit before closing the connection; I mean, you should commit inside that with block itself.
You should do that in your database utility file.
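For illustration, here is a minimal sketch (not the original author's code) of save_words_to_database that commits inside the with block and uses parameterized queries instead of string concatenation:

import sqlite3 as lite

def save_words_to_database(database_path: str, words_list: list):
    conn = lite.connect(database_path)
    with conn:  # the context manager commits on success and rolls back on error
        cur = conn.cursor()
        for word in words_list:
            cur.execute("select count(word) from words where word = ?", (word,))
            if cur.fetchone()[0] > 0:
                cur.execute("update words set usage_count = usage_count + 1 where word = ?", (word,))
            else:
                cur.execute("insert into words(word) values (?)", (word,))
        conn.commit()  # explicit commit inside the block, as suggested above
    conn.close()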
I want to scrape the title from each given URL in multiple threads (for example, 5 threads)
and save them to one text file. How do I do it, and how do I make sure I safely save the output to one file?
this is my code:
import csv
import requests

requests.packages.urllib3.disable_warnings()

urls = []
with open('Input.csv') as csvDataFile:
    csvReader = csv.reader(csvDataFile)
    for row in csvReader:
        urls.append(row[1])

def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

def get_title(url):
    try:
        r = requests.get(url)
        html_content = r.text.encode('UTF-8')
        title = find_between(html_content, "<title>", "</title>")
        return title
    except:
        return ""

for url in urls:
    f = open('myfile.txt', 'a')
    f.write(get_title(url) + '\n')
    f.close()
Try to use futures:
1. create a pool
2. submit the function and its parameters
3. get the result from the function
import csv
from concurrent import futures

pool = futures.ThreadPoolExecutor(5)
workers = [pool.submit(get_title, url) for url in urls]
while not all(worker.done() for worker in workers):
    pass
with open(file, 'w', newline='') as f:  # file is your output filename
    w = csv.writer(f)
    w.writerows([[worker.result()] for worker in workers])
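Alternatively, here is a minimal sketch (assuming get_title and urls are defined as in the question) that keeps all file writing in the main thread, so no locking is needed even though the requests run in 5 worker threads:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as pool:
    # map keeps results in the same order as urls; only worker threads do the HTTP calls
    titles = pool.map(get_title, urls)
    with open('myfile.txt', 'a') as f:
        for title in titles:
            f.write(title + '\n')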