I'm making a wikipedia crawler but it's very slow. How can I make it faster?
I'm using requests module and beautifulsoup4 to parse the html pages. I've tried implementing multithreading but it's still slow.
import requests
from bs4 import BeautifulSoup as bs
from queue import Queue
# Crawl configuration: all article paths are relative to the Wikipedia host.
baseURL = "https://en.wikipedia.org"  # stray trailing ';' removed (not Python style)
startURL = "/wiki/French_battleship_Courbet_(1911)"
endURL = "/wiki/Royal_Navy"
tovisit = Queue()  # BFS frontier of wiki paths still to fetch
visited = []       # wiki paths already fetched
def main():
    """Validate the configured start/end paths, then start the crawl."""
    if not (checkValid(startURL) and checkValid(endURL)):
        print("Invalid URLs entered.")
        quit()
    initCrawler(startURL)
def initCrawler(startURL):
    """Breadth-first crawl from *startURL* until the frontier is empty."""
    global visited
    tovisit.put(startURL)
    # Drain the frontier; linkCrawl() returns the next pages to enqueue.
    while not tovisit.empty():
        current = tovisit.get()
        for child in linkCrawl(current):
            tovisit.put(child)
        visited.append(current)
def linkCrawl(url):
    """Fetch one wiki page and return the new, valid links found on it.

    Exits the whole program (after printing "yay") when endURL is found.

    Performance fix: the original tested ``i in visited`` (a list) and
    rebuilt ``list(tovisit.queue)`` for EVERY anchor on the page, making
    each page O(anchors * frontier).  Both collections are now snapshotted
    into sets once per page, so each membership test is O(1).
    """
    print("crawling " + url + "\n")
    r = requests.get(baseURL + url)
    soup = bs(r.content, "html.parser")
    rawlinks = soup.find_all('a', href=True)
    seen = set(visited)          # one O(n) snapshot instead of per-link scans
    queued = set(tovisit.queue)  # Queue.queue is the underlying deque
    refinedlinks = []
    for rawLink in rawlinks:
        i = rawLink["href"]
        # href=True guarantees the attribute exists; the original 'is None'
        # check was unreachable.  Keep the string guard for odd attr values.
        if not isinstance(i, str):
            continue
        if i in seen or i in queued:
            continue
        if not checkValid(i):
            continue
        if i == endURL:
            print("yay")
            exit()
        refinedlinks.append(i)
        queued.add(i)  # also dedupe repeats of the same link on this page
    return refinedlinks
def checkValid(url):
    """Return True if *url* is a crawlable article path under /wiki/.

    Rejects non-article namespaces (Special:, Wikipedia:, Portal:, File:)
    and disambiguation pages.
    """
    if not url.startswith("/wiki/"):
        return False
    # str.startswith accepts a tuple: one call covers all skipped namespaces.
    if url.startswith(("/wiki/Special:", "/wiki/Wikipedia:",
                       "/wiki/Portal:", "/wiki/File:")):
        return False
    # Disambiguation pages fan out to many unrelated articles; skip them.
    return not url.endswith("(disambiguation)")
# Entry point guard: run the crawler only when executed as a script.
if __name__ == "__main__":
    main()
I expected the bot to run faster, but it is still slow. From what I have read, multithreading alone eventually won't be enough.
Related
I have a small script that I have written (see below) which fetches JSON data from a web url. The goal is to print out any new data in the JSON. Is there any way I can continuously check the URL every 5 seconds and report back any changes? I am sure I am not doing it right, but what i have tried is creating a first list from the JSON object items, waiting 5 seconds creating a second list and then comparing the two. This is obviously not the way to do it because I still have to run the script myself each time. I just want to run the script once have it kind of 'listen' or 'poll' the URL and throw back any changes in data. My code is below, any assistance is greatly appreciated and any other optimizations you would suggest for my script.
import json, requests
import time

# The two CoinGecko market pages (250 coins each) watched for new listings.
urls=["https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page=250&page=1&sparkline=false", "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page=250&page=2&sparkline=false"]
def get_data(url):
    """Fetch *url* and return the list of coin ids from its JSON payload."""
    response = requests.get(url)
    payload = json.loads(response.text)
    return [entry['id'] for entry in payload]
def check_new_coins(url=None):
    """Poll *url* twice, 5 seconds apart, and print ids that appeared.

    ``url`` is a new optional parameter; when omitted it falls back to the
    module-level ``url`` name, which the original code silently read from
    the driver loop's leaked loop variable (fragile, now explicit).
    """
    if url is None:
        url = globals()['url']  # legacy behavior: read the leaked loop variable
    first_list = get_data(url)
    time.sleep(5)
    second_list = get_data(url)
    # sorted() gives deterministic output order (sets are unordered).
    new_coins = sorted(set(second_list).difference(first_list))
    if new_coins:
        for new_coin in new_coins:
            print(new_coin)
    else:
        print("No new coins")
# Driver: checks each endpoint once.  NOTE(review): check_new_coins() reads
# the loop variable 'url' as a module global rather than taking a parameter.
for url in urls:
    check_new_coins()
I think that's what you're looking for
import json, requests
import time
class Checker:
    """Polls the given URLs forever, printing coin ids that newly appear."""

    def __init__(self, urls, wait_time):
        self.wait_time = wait_time      # seconds between polls
        self.urls = urls
        self.coins = self.get_coins()   # baseline snapshot of known ids
        # NOTE: the constructor intentionally blocks forever, matching the
        # original usage `Checker(urls, 5)`.
        self.main_loop()

    # BUG FIX: the original had '#staticmethod' — the '@' of the decorator
    # was lost to markdown rendering, leaving a comment instead.
    @staticmethod
    def get_data(url):
        """Return the list of coin ids in the JSON served at *url*."""
        response = requests.get(url)
        data = json.loads(response.text)
        return [coin['id'] for coin in data]

    def get_coins(self):
        """Fetch every URL and return the union of their coin ids as a set."""
        coins = set()
        for url in self.urls:
            coins.update(Checker.get_data(url))
        return coins

    def check_new_coins(self):
        """Print ids added since the last poll, then update the snapshot."""
        new_coins = self.get_coins()
        coins_diff = list(new_coins.difference(self.coins))
        current_time = time.strftime("%H:%M:%S", time.localtime())
        if len(coins_diff) > 0:
            print(current_time, coins_diff)
        else:
            print(current_time, "No new coins")
        self.coins = new_coins

    def main_loop(self):
        """Sleep, poll, repeat — forever."""
        while True:
            time.sleep(self.wait_time)
            self.check_new_coins()
# Entry point: builds the watch list and starts the (blocking) Checker.
if __name__ == '__main__':
    urls=[
        "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page=250&page=1&sparkline=false",
        "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page=250&page=2&sparkline=false"
    ]
    Checker(urls, 5)
sample output:
18:57:20 No new coins
18:57:25 No new coins
18:57:30 No new coins
18:57:35 No new coins
18:57:41 No new coins
18:57:46 No new coins
18:57:51 No new coins
18:57:56 No new coins
I'm doing some web scraping from Yahoo Finance (NVIDIA stock), and I'm wondering why I always get the same value when I run my code, even though refreshing the page in my browser gives different values (as it should). How can I fix this?
import requests
from datetime import datetime
import time
def Is_Number(string):
    """Return True if *string* parses as an int or is a decimal point.

    Used to scan a price character-by-character, so '.' counts as numeric.
    """
    try:
        int(string)
        return True
    # Narrowed from a bare 'except:' which also swallowed unrelated errors
    # (and even KeyboardInterrupt); int() on a bad string raises ValueError.
    except ValueError:
        return string == '.'
# Scrape the NVDA quote page ten times, 10 s apart, extracting the price by
# scanning characters that follow the 'data-pricehint' attribute.
session = requests.Session()
for i in range(10):
    Response = session.get("https://finance.yahoo.com/quote/NVDA?p=NVDA")
    KeyWord = 'data-pricehint'
    # +26 skips past the attribute text to where the price digits start.
    # NOTE(review): the session reuses cookies/connections, so the server may
    # keep serving the same cached page — presumably why the printed value
    # never changes; verify against a cookie-free request.
    Index = Response.text.find(KeyWord) + 26
    GoOn = True
    CurrentPrice = ""
    # Accumulate characters while they look numeric (digits or '.').
    while(GoOn == True):
        if ( Is_Number(Response.text[Index])):
            CurrentPrice = CurrentPrice + Response.text[Index]
            Index = Index + 1
        else:
            GoOn = False
    CurrentTime = datetime.now().strftime('%H:%M:%S')
    print("# Price:",CurrentPrice,"at",CurrentTime)
    time.sleep(10)
Why don't you try yfinance instead?:
pip install yfinance
import yfinance as yf
import time
def get_price() -> float:
    """Return NVDA's current regular-market price via yfinance."""
    ticker = yf.Ticker("NVDA")
    return ticker.info.get("regularMarketPrice")
def run():
    """Print the NVDA price ten times, ten seconds apart."""
    for attempt in range(10):
        print(f"{attempt}: {get_price()}")
        time.sleep(10)
# Run the ten-sample price loop only when executed as a script.
if __name__ == '__main__':
    run()
I'm very new to python, but I've made a lot of progress over the last few days. The below script works fine, but I just can't figure out how implement code that would print an incremented number every time 'avail' is equal to NO. I'd like to have it print something like 'None Available 1' on the first loop, then 'None Available 2' on the second loop, then 'None Available 3' on the third loop, etc..
import requests
import time
import subprocess
from bs4 import BeautifulSoup
def get_page(url):
    """GET *url* and return its parsed soup; on a bad status, print and
    fall through (returning None implicitly, as the original did)."""
    response = requests.get(url)
    if response.ok:
        return BeautifulSoup(response.text, 'lxml')
    # NOTE(review): returning None here makes the caller's soup.find(...)
    # raise AttributeError — presumably acceptable upstream; confirm.
    print('Server responded:', response.status_code)
def get_detail_data(soup):
    """Map the availability span's text to True / False / None.

    "YES" -> True; "NO" -> False (printing 'None Available');
    anything else -> None (printing "Unexpected value").
    """
    availability = soup.find('span', id='availability').text.strip()
    if availability == "YES":
        return True
    if availability == "NO":
        print('None Available')
        return False
    print("Unexpected value")
    return None
def main():
    """Poll the page every 2 seconds; run the batch file once available."""
    url ='https://www.blahblah.com'
    # Loop until get_detail_data reports availability (True); False/None
    # both keep polling, exactly as the original while-True/break did.
    while not get_detail_data(get_page(url)):
        time.sleep(2)
    subprocess.call(["C:\\temp\\filename.bat"], shell=False)
# Start polling when run as a script.
if __name__ == '__main__':
    main()
The following would probably work, but there might be a better way to structure it.
# Module-level counter shared across calls to get_detail_data().
_not_avail_counter = 0

def get_detail_data(soup):
    """Like the original, but numbers each 'None Available' message."""
    # BUG FIX: without this declaration, the += below made the name local
    # and the function raised UnboundLocalError on the first "NO".
    global _not_avail_counter
    avail = soup.find('span', id='availability').text.strip()
    if avail == "YES":
        return True
    elif avail == "NO":
        _not_avail_counter += 1
        print('None Available ' + str(_not_avail_counter))
        return False
    else:
        print("Unexpected value")
        return None
I would suggest changing your while True loop into a for loop on an itertools.count iterator. You can pass the value from the count to the get_detail_data function with an argument.
import itertools
def get_detail_data(soup, count): # take the count as an argument
    # Sketch only: the '...' lines are placeholders standing in for the
    # corresponding parts of the questioner's original function body.
    avail = soup.find('span', id='availability').text.strip()
    if ...
    # ...
    elif avail == "NO":
        print('None Available', count) # include count here (and anywhere else you want)
    # ...
def main():
    # Same driver as before, but itertools.count() supplies the increasing
    # attempt number instead of a hand-managed module-level counter.
    url ='https://www.blahblah.com'
    for c in itertools.count(): # produce the count in a loop
        is_available = get_detail_data(get_page(url), c)
        # ...
Note that itertools.count starts counting at zero. If you want to start at 1 (like a human usually would when counting things), you may want to pass 1 as the start argument: for c in itertools.count(1).
So what I want to do is build a sort of monitor for a website that picks a random number. Before that, it needs to request the website to see whether it is up or not. When the site is live, it generates random numbers from 1 to 100; I want to check it every 3-6 seconds (randomly), print the number each time, and repeat until the website goes down.
What I have tried to do is following:
def get_product_feed(url_site):
    """Block until *url_site* serves a live page, then return it parsed.

    While the response URL contains 'deadsite' the site is considered down
    and we retry every 3-7 s; request failures back off for 1-2 s.
    """
    thread = url_site
    while True:
        try:
            checkpass = requests.get(thread, timeout=12)
            if 'deadsite' not in checkpass.url:
                # BUG FIX: the original parsed 'contcheck.text' here, but
                # 'contcheck' is only assigned in the dead-site branch — the
                # resulting NameError was swallowed by the except below,
                # turning a live site into an endless retry loop.
                return soup(checkpass.text, 'lxml')
            # Site redirects to the dead page: keep re-checking until it
            # comes back, then return the freshly fetched live page.
            while True:
                contcheck = requests.get(thread, timeout=12)
                if 'deadsite' in contcheck.url:
                    time.sleep(random.randint(3, 7))
                else:
                    return soup(contcheck.text, 'lxml')
        except Exception:
            time.sleep(random.randint(1, 2))
def get_info(thread):
    """Request *thread* until a JSON response arrives; return its 'number'.

    Any request/parse failure sleeps 1-3 s and retries forever.
    """
    while True:
        try:
            resp = requests.get(thread, timeout=12)
            resp.raise_for_status()
            payload = resp.json()
        except Exception:
            time.sleep(random.randint(1, 3))
        else:
            # Outside the try in the original too: a missing key propagates.
            return payload['number']
def get_identifier(thread):
    """Return the thread's number (from get_info) formatted as a string.

    The original wrapped a plain assignment in try/except KeyError — an
    assignment cannot raise KeyError, so the handler was dead code and has
    been removed; '{}'.format(x) is replaced by the equivalent str(x).
    """
    thread_number = get_info(thread)
    return str(thread_number)
def script():
    """Print each new number from the product feed exactly once, forever."""
    url_site = 'https://www.randomsitenumbergenerator.com/'
    old_list = []
    while True:
        for thread in get_product_feed(url_site):
            # BUG FIX: the original's print/append lines had unbalanced
            # parentheses (SyntaxError) and called get_identifier() three
            # times per item; fetch the value once and reuse it.
            identifier = get_identifier(thread)
            if identifier not in old_list:
                print(identifier)
                old_list.append(identifier)
I added a comment to make it easier to understand what is going on.
The issue I am having now is that I cannot get get_identifier(thread) to keep running while the website is up: I want it to keep printing numbers from the moment the site comes alive until it goes down again. That is my question — what do I need to change to make that happen?
My thoughts was to add eventually threads maybe that 10 threads are checking at the same time to see if the website is dead or not and give back the value as a print but I am not sure if that is my solution for the question.
so I want to check a website to update me whenever there's a new item posted. They don't update often so I'm fairly certain when they do update it will be the item of interest. I want to achieve this by choosing a "starting number" and count the number of links on the page then compare that number to the number of links every 10 minutes, until the number of links are greater than the starting number.
First I run this to get the "starting number" of links:
# One-off snapshot taken first: the number of links currently on the page.
# ('soup' must already hold a parsed page at this point.)
links=[]
for link in soup.findAll('a'):
    links.append(link.get('href'))
start_num = len(links)
Then compare that number to the number of links right now and every 5 seconds:
# Poll until the page has more links than start_num, then send one message
# and stop.  Checks every 5 minutes; errors back off for 10 minutes.
notify=True
while notify:
    try:
        page = urllib.request.urlopen('web/site/url')
        soup = bs(page, "lxml")
        links=[]
        for link in soup.findAll('a'):
            links.append(link.get('href'))
        if len(links) > start_num:
            message = client.messages.create(to="", from_="",body="")
            print('notified')
            notify=False
        else:
            print('keep going')
            time.sleep(60*5)
    # NOTE(review): a bare except also swallows KeyboardInterrupt — consider
    # narrowing to Exception.
    except:
        print("Going to sleep")
        time.sleep(60*10)
How can I combine all this into 1 function where I run can store the starting number of links without overwriting it every time I check it against the current number of links?
you can do it at least two ways: decorators and generators
Decorators:
def hang_on(func):
    """Decorator: delay calling *func* until the page gains new links.

    Requires 'soup' (an already-parsed page) to be visible in the enclosing
    scope, as the original comment noted.
    """
    # soup should be in a visible scope
    def count_links():
        # refresh page?
        return len(soup.findAll('a'))
    start_num = count_links()
    def wrapper(*args, **kwargs):
        # BUG FIX: assigning start_num below made it local to wrapper, so
        # the comparison above it raised UnboundLocalError; declare it
        # nonlocal to rebind the enclosing hang_on variable.
        nonlocal start_num
        while True:
            try:
                new_links = count_links()
                if new_links > start_num:
                    start_num = new_links
                    return func(*args, **kwargs)  # BUG FIX: was the typo 'fund'
                print('keep going')
                time.sleep(60*5)
            except:
                print("Going to sleep")
                time.sleep(60*10)
    return wrapper
# BUG FIX: the decorator's '@' was turned into '#' by markdown rendering;
# as a comment, notify() was never wrapped by hang_on at all.
@hang_on
def notify():
    message = client.messages.create(to="", from_="",body="")
    print('notified')

# somewhere in your code, simply:
notify()
Generators:
def gen_example(soup):
    """Generator: yields True each time the page's link count grows.

    Polls every 5 minutes; on any error, sleeps 10 minutes and retries.
    """
    def _link_total():
        # refresh page?  (a re-fetch/re-parse of soup would go here)
        return len(soup.findAll('a'))
    baseline = _link_total()
    while True:
        try:
            current = _link_total()
            if current > baseline:
                baseline = current
                message = client.messages.create(to="", from_="",body="")
                print('notified')
                yield True # this is what makes this func a generator
            print('keep going')
            time.sleep(60*5)
        except:
            print("Going to sleep")
            time.sleep(60*10)
# somewhere in your code:
gen = gen_example(soup) # initialize
# BUG FIX: generators have no .next() method in Python 3; use next(gen).
next(gen) # will wait and notify
# coming soon
I would implement it as a class, because this code is quite readable and easy to support. Enjoy:
class Notifier:
    """Polls a page and sends one notification when its link count grows."""
    url = 'web/site/url'
    timeout = 60 * 10  # seconds between polls

    def __links_count(self):
        """Return how many anchor tags the page currently contains."""
        page = urllib.request.urlopen(self.url)
        soup = bs(page, "lxml")
        hrefs = [anchor.get('href') for anchor in soup.findAll('a')]
        return len(hrefs)

    def __notify(self):
        """Send the (empty) message and log it."""
        client.messages.create(to="", from_="", body="")
        print('notified')

    def run(self):
        """Watch the page until the link count exceeds the starting count."""
        baseline = self.__links_count()
        while True:
            try:
                if self.__links_count() > baseline:
                    self.__notify()
                    break
                sleep(self.timeout)
            except:
                print('Keep going')
                sleep(self.timeout)
# BUG FIX: the class is named Notifier; 'Norifier()' raised NameError.
notifier = Notifier()
notifier.run()