KeyError while crawling websites for keyword and status - python

I'm currently trying to do two things together when checking multiple websites from my input CSV file:
Check the HTTP status
Check whether the website displays a specific keyword
then save the results to a new CSV file.
My input.csv:
id url
1 https://example123.com
2 https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/
3 https://mundoshoponline.com
My Code:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
df = pd.read_csv('path/to/my/input.csv')
#my csv has urls in the 1st column
urls = df.T.values.tolist()[1]
results = {}
status = []
async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        # all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text=re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text=re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text=re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text=re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text=re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text=re.compile("Currently Unavailable", re.I))}
        results[url] = data
    # check for http status and save to status list
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        status.append("Down")
    except requests.exceptions.HTTPError:
        status.append("Other")
    else:
        status.append("OK")

async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
comingList= []
openingList = []
forbiddenList= []
notfoundList = []
underList = []
currentlyList = []
# mark x if there are any hits for specific keyword
for url in results:
    comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
    openingList.append("x" if len(results[url]["Opening Soon"]) > 0 else "")
    forbiddenList.append("x" if len(results[url]["Forbidden"]) > 0 else "")
    notfoundList.append("x" if len(results[url]["Page not found"]) > 0 else "")
    underList.append("x" if len(results[url]["Under Construction"]) > 0 else "")
    currentlyList.append("x" if len(results[url]["Currently Unavailable"]) > 0 else "")
df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = status
print(df)
df.to_csv('path/to/my/output.csv', index=False)
However, whenever I run the above script with for url in urls:
for some of my URLs it throws this error, the script breaks, and output.csv is not generated:
Traceback (most recent call last):
  File "path/to/myscan.py", line 51, in <module>
    comingList.append("x" if len(results[url]["coming soon"]) > 0 else "")
KeyError: 'http://example123.com'
and when running it with for url in results: output.csv is as follows:
(screenshot of the generated output.csv)
This seems erroneous, as the first row has keywords marked as present (comingSoon and underConstruction columns) plus status = Down, even though that website doesn't contain the 'coming soon' or 'under construction' strings.
Would someone be able to help me with this? I believe there might be an issue in my loop or try/except part of the code. I'm happy to provide more information if the above is not sufficient. Thank you in advance.

I think your main problem is that you are iterating over all of the urls, some of which may have failed and therefore do not exist as keys in results.
A much safer way to do this is to iterate over the subset of urls that you are sure have succeeded and have a key in results, so instead of
for url in urls:
you could make it
for url in results:
To make the final results consistent with the input order of your urls:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
df = pd.read_csv('./input.csv')
# urls hardcoded here for testing; normally you would take them from the csv
urls = ['example123.com', 'https://envato.com/blog/30-outstanding-coming-soon-and-under-construction-website-templates/', 'http://alotechgear.com']
results = {}
status = {}
async def scrape(url):
    try:
        r = requests.get(url, timeout=(3, 6))
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        # all keywords to check on the website
        data = {
            "coming soon": soup.body.findAll(text=re.compile("coming soon", re.I)),
            "Opening Soon": soup.body.findAll(text=re.compile("Opening Soon", re.I)),
            "Forbidden": soup.body.findAll(text=re.compile("Forbidden", re.I)),
            "Page not found": soup.body.findAll(text=re.compile("Page not found", re.I)),
            "Under Construction": soup.body.findAll(text=re.compile("Under Construction", re.I)),
            "Currently Unavailable": soup.body.findAll(text=re.compile("Currently Unavailable", re.I))}
        results[url] = data
    # check for http status and save to status dict
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout, requests.exceptions.MissingSchema):
        status[url] = "Down"
    except requests.exceptions.HTTPError:
        status[url] = "Other"
    else:
        status[url] = "OK"

async def main():
    await asyncio.wait([scrape(url) for url in urls])

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
comingList= []
openingList = []
forbiddenList= []
notfoundList = []
underList = []
currentlyList = []
statusList = []
# mark x if there are any hits for specific keyword
for url in urls:
    if(not results.get(url)):
        statusList.append(status.get(url))
        notfoundList.append("x")
        comingList.append("-")
        openingList.append("-")
        forbiddenList.append("-")
        underList.append("-")
        currentlyList.append("-")
    else:
        statusList.append(status.get(url))
        comingList.append("x" if len(results[url].get("coming soon")) > 0 else "-")
        openingList.append("x" if len(results[url].get("Opening Soon")) > 0 else "-")
        forbiddenList.append("x" if len(results[url].get("Forbidden")) > 0 else "-")
        notfoundList.append("x" if len(results[url].get("Page not found")) > 0 else "-")
        underList.append("x" if len(results[url].get("Under Construction")) > 0 else "-")
        currentlyList.append("x" if len(results[url].get("Currently Unavailable")) > 0 else "-")
df["comingSoon"] = pd.DataFrame(comingList, columns=['comingSoon'])
df["openingSoon"] = pd.DataFrame(openingList, columns=['openingSoon'])
df["forbidden"] = pd.DataFrame(forbiddenList, columns=['forbidden'])
df["notfound2"] = pd.DataFrame(notfoundList, columns=['notfound2'])
df["underConstruction"] = pd.DataFrame(underList, columns=['underConstruction'])
df["currentlyUnavailable"] = pd.DataFrame(currentlyList, columns=['currentlyUnavailable'])
df['status'] = pd.DataFrame(statusList, columns=['Status'])
print(df)
df.to_csv('./output.csv', index=False)
sample result:
id url comingSoon openingSoon forbidden notfound2 underConstruction currentlyUnavailable status
0 1 https://example123.com - - - x - - Down
1 2 https://envato.com/blog/30-outstanding-c... x - - - x - OK
2 3 https://mundoshoponline.com - - - x - - Down
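A side note, separate from the KeyError fix: requests.get is a blocking call, so even though scrape is declared async, the pages are effectively fetched one after another. If you want the fetches to actually overlap, one option (a minimal sketch, not part of the fix above) is to hand the blocking call to a thread pool with run_in_executor:

import asyncio
import functools
import requests

async def fetch(url):
    loop = asyncio.get_event_loop()
    # Run the blocking requests call in the default thread pool so the event
    # loop can start other fetches while this one is waiting on the network.
    return await loop.run_in_executor(
        None, functools.partial(requests.get, url, timeout=(3, 6))
    )

scrape could then do r = await fetch(url) instead of calling requests.get directly; the parsing and the status bookkeeping stay the same.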

Related

How to get URL from two dropdown lists (webscraping with python)

I want to webscrape this webpage (www.autocar.co.uk). To do so, I want to select each car manufacturer in a drop-down menu, and each model, to get the HREF/reference to the model page and then retrieve some information from each model page (not reflected in the code yet).
As I just started coding, I would highly appreciate your input! Thanks in advance!! :)
Desired output:
https://www.autocar.co.uk/car-review/abarth/595
https://www.autocar.co.uk/car-review/abarth/595-competizione
https://www.autocar.co.uk/car-review/abarth/124-spider-2016-2019
https://www.autocar.co.uk/car-review/abarth/695-biposto-2015-2016
https://www.autocar.co.uk/car-review/ac-schnitzer/acs3-sport
https://www.autocar.co.uk/car-review/ac-schnitzer/acs1
https://www.autocar.co.uk/car-review/ac-schnitzer/acs5-sport
https://www.autocar.co.uk/car-review/allard/j2x-mkii
https://www.autocar.co.uk/car-review/alfa-romeo/giulia
https://www.autocar.co.uk/car-review/alfa-romeo/tonale
Output as of now (the "https://www.autocar.co.uk0" lines need to be removed):
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/abarth/595
https://www.autocar.co.uk/car-review/abarth/595-competizione
https://www.autocar.co.uk/car-review/abarth/124-spider-2016-2019
https://www.autocar.co.uk/car-review/abarth/695-biposto-2015-2016
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/ac-schnitzer/acs3-sport
https://www.autocar.co.uk/car-review/ac-schnitzer/acs1
https://www.autocar.co.uk/car-review/ac-schnitzer/acs5-sport
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/allard/j2x-mkii
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/alfa-romeo/giulia
https://www.autocar.co.uk/car-review/alfa-romeo/tonale
Code as of now:
from bs4 import BeautifulSoup
import requests
import pandas as pd
#Inputs/URLs to scrape:
url = "http://www.autocar.co.uk/"
s = requests.Session()
r = s.get(url)
soup = BeautifulSoup(r.text,'html.parser')
full_car_list = []
car_list = [(x.text, x.get("value"), f'https://www.autocar.co.uk/ajax/car-models/{x.get("value")}/0') for x in soup.select_one('#edit-make').select('option')]
for x in car_list:
    r = s.get(x[2])
    try:
        for item in r.json()['options'].items():
            # Car Model
            car_model_url = (f'https://www.autocar.co.uk{item[0]}')
            print(car_model_url)
    except Exception as e:
        full_car_list.append((x[0], 'no models', f'https://www.autocar.co.uk/vehicles/{x[0]}'))
You'll want to refactor things into a couple of functions for clarity; that also makes it easier to skip data that isn't valid (apparently occasionally you'd get a list from the ajax/car-models API):
from bs4 import BeautifulSoup
import requests
sess = requests.Session()
def get_make_info():
    resp = sess.get("http://www.autocar.co.uk/")
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    for option in soup.select('#edit-make option'):
        make_id = option['value']
        yield (make_id, option.text)

def get_make_models(make_id):
    info_url = f'https://www.autocar.co.uk/ajax/car-models/{make_id}/0'
    resp = sess.get(info_url)
    resp.raise_for_status()
    data = resp.json()
    options = data['options']
    if isinstance(options, list):  # Invalid format, skip
        return
    for model_url, model_name in options.items():
        if model_url == "0":  # "All models"
            continue
        model_url = f'https://www.autocar.co.uk{model_url}'
        yield (model_url, model_name)

for make_id, make_name in get_make_info():
    for model_url, model_name in get_make_models(make_id):
        print(make_id, make_name, model_url, model_name)
Using the code as written for your previous question, all you have to do is print out the 'Url' column of the dataframe:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "http://www.autocar.co.uk/"
s = requests.Session()
r = s.get(url)
soup = BeautifulSoup(r.text,'html.parser')
full_car_list = []
car_list = [(x.text, x.get("value"), f'https://www.autocar.co.uk/ajax/car-models/{x.get("value")}/0') for x in soup.select_one('#edit-make').select('option')]
for x in car_list:
    r = s.get(x[2])
    try:
        for item in r.json()['options'].items():
            full_car_list.append((x[0], item[1], f'https://www.autocar.co.uk{item[0]}'))
    except Exception as e:
        full_car_list.append((x[0], 'no models', f'https://www.autocar.co.uk/vehicles/{x[0]}'))

cars_df = pd.DataFrame(full_car_list[1:], columns=['Make', 'Model', 'Url'])
cars_df = cars_df[cars_df.Model != 'All models']
cars_df.to_csv('makes_models.csv')
for x in cars_df.Url.tolist():
    print(x)

How to output only relevant changes while scraping for new discounts?

In a previous question I got the answer from Hedgehog! (How to check for new discounts and send to telegram if changes detected?)
But another question is: how can I get only the new (product) items in the output and not all the text that has changed? My feeling is that the output I get is literally anything that changed on the website, not only the newly added discounts.
Here is the code; see the attachment for the output. Thanks again for all the effort.
# Import all necessary packages
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# Define scraper
scraper = cloudscraper.create_scraper()

# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    bot_token = '1XXXXXXXXXXXXXXXXXXXXXXXXXXG5pses8'
    bot_chatID = '-XXXXXXXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message
    response = requests.get(send_text)
    return response.json()

PrevVersion = ""
FirstRun = True
while True:
    # Download the page with the specified URL
    response = scraper.get("https://").content
    # Url for in the messages to show
    url = "https://"
    # Act like a browser
    #headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Parse the downloaded page and check for discount on the page
    soup = BeautifulSoup(response, 'html.parser')
    def get_discounts(soup):
        for d in soup.select('.cept-discount'):
            if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
                return True
            else:
                return False
    # Remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    discounts = get_discounts(soup)
    soup = soup.get_text()
    # Compare the page text to the previous version and check if there are any discounts in your range
    if PrevVersion != soup and discounts:
        # On the first run - just memorize the page
        if FirstRun == True:
            PrevVersion = soup
            FirstRun = False
            print("Start Monitoring " + url + " " + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            OldPage = PrevVersion.splitlines()
            NewPage = soup.splitlines()
            diff = difflib.context_diff(OldPage, NewPage, n=0)
            out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()])
            print(out_text)
            OldPage = NewPage
            # Send a message with the telegram bot
            telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
            # print('\n'.join(diff))
        PrevVersion = soup
    else:
        print("No Changes " + str(datetime.now()))
    time.sleep(5)
    continue
What happens?
As discussed, your assumption is going in the right direction: all the changes identified by difflib will be displayed.
It may be possible to adjust the output of difflib, but difflib is not strictly necessary for this task.
How to fix?
The first step is to upgrade get_discounts(soup) so that it not only checks whether the discount is in range but also collects information about the item itself, which you can display or operate on later:
def get_discounts(soup):
    discounts = []
    for d in soup.select('.cept-discount'):
        if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
            discounts.append({
                'name': d.find_previous('strong').a.get('title'),
                'url': d.find_previous('strong').a.get('href'),
                'discount': d.text,
                'price': d.parent.parent.select_one('.thread-price').text,
                'bestprice': d.previous_sibling.text
            })
    return discounts
The second step is to check whether there is a new discount, similar in spirit to difflib but more focused:
def compare_discounts(d1: list, d2: list):
    diff = [i for i in d1 + d2 if i not in d1]
    result = len(diff) == 0
    if not result:
        return diff
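For illustration, a quick usage example (the item dicts here are made up): compare_discounts returns only the entries that are in the second list but not the first, and returns None when nothing new was found:

old = [{'name': 'Item A', 'discount': '70%'}]
new = [{'name': 'Item A', 'discount': '70%'}, {'name': 'Item B', 'discount': '80%'}]
print(compare_discounts(old, new))   # [{'name': 'Item B', 'discount': '80%'}]
print(compare_discounts(new, new))   # None, i.e. no new discounts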
The last step is to react to changes in the discounts; if there are new ones, their urls are printed so you can go directly to the offered products.
Note: because we have stored additional information in our list of dicts, you can adjust the printing to output the whole record or only specific attributes.
if newDiscounts:
    # Send a message with the telegram bot
    print('\n'.join([c['url'] for c in newDiscounts]))
    telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
Example
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# Define scraper
scraper = cloudscraper.create_scraper()

# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    bot_token = '1XXXXXXXXXXXXXXXXXXXXXXXXXXG5pses8'
    bot_chatID = '-XXXXXXXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message
    response = requests.get(send_text)
    return response.json()

PrevVersion = ""
PrevDiscounts = []
FirstRun = True

def get_discounts(soup):
    discounts = []
    for d in soup.select('.cept-discount'):
        if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
            discounts.append({
                'name': d.find_previous('strong').a.get('title'),
                'url': d.find_previous('strong').a.get('href'),
                'discount': d.text,
                'price': d.parent.parent.select_one('.thread-price').text,
                'bestprice': d.previous_sibling.text
            })
    return discounts

def compare_discounts(d1: list, d2: list):
    diff = [i for i in d1 + d2 if i not in d1]
    result = len(diff) == 0
    if not result:
        return diff

while True:
    # Download the page with the specified URL
    response = requests.get("https://nl.pepper.com/nieuw").content
    # Url for in the messages to show
    url = "https://nl.pepper.com/nieuw"
    # Parse the downloaded page and check for discount on the page
    soup = BeautifulSoup(response, 'html.parser')
    # Remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    discounts = get_discounts(soup)
    souptext = soup.get_text()
    # Compare the page text to the previous version and check if there are any discounts in your range
    if PrevVersion != souptext and discounts:
        # On the first run - just memorize the page
        if FirstRun == True:
            PrevVersion = souptext
            PrevDiscounts = discounts
            FirstRun = False
            print("Start Monitoring " + url + " " + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            newDiscounts = compare_discounts(PrevDiscounts, discounts)
            if newDiscounts:
                print('\n'.join([c['url'] for c in newDiscounts]))
                # Send a message with the telegram bot
                telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url)
            else:
                print('These are general changes but there are no new discounts available.')
            PrevVersion = souptext
            PrevDiscounts = discounts
    else:
        print("No Changes " + str(datetime.now()))
    time.sleep(10)
    continue
Output
Start Monitoring https://nl.pepper.com/nieuw 2021-12-12 12:28:38.391028
No Changes 2021-12-12 12:28:54.009881
Changes detected at: 2021-12-12 12:29:04.429961
https://nl.pepper.com/aanbiedingen/gigaset-plug-startpakket-221003
No Changes 2021-12-12 12:29:14.698933
No Changes 2021-12-12 12:29:24.985394
No Changes 2021-12-12 12:29:35.271794
No Changes 2021-12-12 12:29:45.629790
No Changes 2021-12-12 12:29:55.917246
Changes detected at: 2021-12-12 12:30:06.184814
These are general changes but there are no new discounts available.

I have been trying to create a csv file from data received from a web scraper

As of right now I have working code: a web scraper that logs into the Indeed job search site. My issue is that I need to create a csv file that shows every single job position that was found; right now it only gives me the number of positions available and the description of one of them. I hope I can get some help, I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime  # needed for datetime.today() in get_record below
from time import sleep
from random import randint
jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = { "q": "mechanical+engineer", "l": "united+states", "start": 0 }
url = "https://www.indeed.com/jobs"
job_keys = set()
for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))

len(job_keys)
template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record
def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
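Note that main() calls a get_url(position, location) helper that is not shown in the question. A minimal sketch of what it might look like, assuming the same q and l query parameters used in the params dict near the top of the question (the exact URL format here is an assumption, not code from the question):

def get_url(position, location):
    """Build an Indeed search-results URL from a position and a location
    (assumed format, mirroring the q/l parameters used earlier)."""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    return template.format(position.replace(' ', '+'), location.replace(' ', '+'))

# e.g. get_url('mechanical engineer', 'united states')
# -> 'https://www.indeed.com/jobs?q=mechanical+engineer&l=united+states'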

Finding when a webpage updates via Python?

So, I am scraping a webpage that has an element displaying an integer. When I scrape that element, I store the plain text in a variable; then, each time it scrapes, I compare the variable to the plain text currently on the webpage. I am not sure if maybe I need to make a request to the webpage each time?
from win10toast import ToastNotifier
from _overlapped import NULL
from plyer import notification
import requests
from bs4 import BeautifulSoup
toaster = ToastNotifier()
toaster.show_toast("Notification!", "Alert!", threaded=True, icon_path=NULL, duration=3)
URL = "https://rocketleague.tracker.network/rocket-league/profile/steam/76561198074072333/mmr?playlist=13"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
_title = ""
_message = ""
recent_mmr = "111"
def get_mmr(url):
    results = soup.find_all(class_="stat")
    for stat in results:
        titles = stat.find_all(class_="label")
        for t in titles:
            if(t.text.strip() == "Rating"):
                val = stat.find(class_="value").text.strip()
                return val

def get_rank(url):
    results = soup.find(class_="stat tier")
    rank = results.find(class_="label")
    return rank.text.strip()

_message = "Rank: " + get_rank(URL) + "\n" + "MMR: " + get_mmr(URL)
recent_mmr = get_mmr(URL)
import time
while toaster.notification_active():
    time.sleep(0.1)

notification.notify(
    title="Ranked 3v3",
    message=_message,
    app_icon=NULL,
    timeout=10
)
print(recent_mmr)
recent_mmr = get_mmr(URL)
while True:
    print('running')
    #page = requests.get(URL)
    recent_mmr = get_mmr(URL)
    mmr_temp = recent_mmr
    print(mmr_temp + "(temp mmr)")
    if mmr_temp == recent_mmr:
        print("No update, recent MMR: " + recent_mmr)
        mmr_temp = recent_mmr
        time.sleep(60)
    else:
        notification.notify(
            title="Ranked 3v3",
            message=_message,
            app_icon=NULL,
            timeout=10
        )
        time.sleep(60)
        recent_mmr = get_mmr(URL)
        mmr_temp = recent_mmr
        print("Updated, recent MMR: " + recent_mmr)
You're scraping the webpage to get the recent_mmr number, copying that to mmr_temp, and then immediately comparing to see if they're equal -- well of course they are, because you just copied it!
You need to reorganize the loop a little bit, and copy the mmr variable at the bottom of the loop:
previous_mmr = None

while True:
    recent_mmr = get_mmr()
    if recent_mmr != previous_mmr:
        print("mmr changed")
        previous_mmr = recent_mmr
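As for whether a new request is needed each time: yes. In the posted code, soup is built once at the top of the script, so get_mmr(URL) keeps parsing the same snapshot of the page. A minimal sketch of a get_mmr that fetches a fresh copy on every call, reusing the same selectors as the question's code (and assuming the page structure stays the same):

def get_mmr(url):
    page = requests.get(url)  # fetch a fresh copy of the page on every call
    soup = BeautifulSoup(page.content, 'html.parser')
    for stat in soup.find_all(class_="stat"):
        for label in stat.find_all(class_="label"):
            if label.text.strip() == "Rating":
                return stat.find(class_="value").text.strip()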

While running python code program flow gets stuck in try block

Python code gets stuck in the try block:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
#import urllib2
def url1_to_string(url1):
    html = ""
    proxyDict = {
        'http': 'http://username:pwd#proxyurl:8080',
        'https': 'https://username:pwd#proxyurl:8080'
    }
    try:
        print('Before res in try')
        res = requests.get(url1, proxies=proxyDict)
        print('After res in try')
    except:
        pass
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
df=pd.read_csv(r'C:\filepath\abc.csv',encoding='latin-1')
anchor_count = []
account_count = []
aggregate_page_count=[]
agg_url_count=[]
for index, row in df.iterrows():
    agg_url_list = []
    ini_url = "http://www.google.com/search?q=" + row['ANCHOR_NAME'] + " AND " + row['ACCOUNT_NAME']
    r = requests.get(ini_url, proxies={"http": "http://one.proxy.att.com:8080"})
    ny_bb1 = url1_to_string(ini_url)
    anchor_count.append(ny_bb1.lower().count(row['ANCHOR_NAME'].lower()))
    account_count.append(ny_bb1.lower().count(row['ACCOUNT_NAME'].lower()))
    print(anchor_count)
    soup = BeautifulSoup(r.text, "html.parser")
    get_details1 = soup.find_all("div", attrs={"class": "g"})
    sublist1 = []
    for details1 in get_details1:
        link1 = details1.find_all("h3")
        for mdetails1 in link1[:]:
            links1 = mdetails1.find_all("a")
            lmk1 = ""
            for lnk1 in links1[:]:
                lmk1 = lnk1.get("href")[7:].split("&")
                sublist1.append(lmk1[0])
    aggregate_count1 = 0
    for x1 in sublist1[:3]:
        anchorcount1 = 0
        accountcount1 = 0
        print("aagg url", x1)
        try:
            print('In try block')
            ny_bb1 = url1_to_string(x1)
        except KeyboardInterrupt:
            print('You cancelled the operation.')
        finally:
            pass
        ny_bb1 = ny_bb1.upper()
        print(ny_bb1)
        row['ANCHOR_NAME'] = row['ANCHOR_NAME'].upper()
        row['ACCOUNT_NAME'] = row['ACCOUNT_NAME'].upper()
        anchor_name = re.match(r'\W*(\w[^,. !?"]*)', row['ANCHOR_NAME']).groups()[0]
        account_name = re.match(r'\W*(\w[^,. !?"]*)', row['ACCOUNT_NAME']).groups()[0]
        if(anchor_name == account_name):
            if(row['ANCHOR_NAME'] in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if(row['ACCOUNT_NAME'] in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        else:
            if(anchor_name in ny_bb1.upper()):
                anchorcount1 = anchorcount1 + 1
            if(account_name in ny_bb1.upper()):
                accountcount1 = accountcount1 + 1
        if(anchorcount1 > 0 and accountcount1 > 0):
            aggregate_count1 = aggregate_count1 + 1
            agg_url_list.append(x1[:])
            print("existance of both", aggregate_count1)
    aggregate_page_count.append(aggregate_count1)
    agg_url_count.append(agg_url_list)

df['anc_cnt'] = pd.Series(anchor_count)
df['acc_cnt'] = pd.Series(account_count)
df['agg_cnt'] = pd.Series(aggregate_page_count)
df['agg_url_list'] = pd.Series(agg_url_count)
The contents of the abc.csv file are as follows:
ANCHOR_NAME,ACCOUNT_NAME
ABC,ABC
XYZ,ZYZ
and so on
For particular URLs the code gets stuck in the try block and control never reaches the except block, where I want to ignore the exception and continue with the normal program flow, i.e. move on to the next URLs and so on.
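If the request itself never returns, no exception is raised, so the except block is never reached: requests.get without a timeout can wait indefinitely on an unresponsive URL or proxy. Below is a minimal sketch of url1_to_string with a timeout and a broader handler, so a bad URL is skipped instead of hanging; the timeout values and the early return are choices of this sketch, not part of the original code:

import re
import requests
from bs4 import BeautifulSoup

def url1_to_string(url1):
    proxyDict = {
        'http': 'http://username:pwd#proxyurl:8080',
        'https': 'https://username:pwd#proxyurl:8080'
    }
    try:
        # timeout=(3, 10): give up after 3 s connecting or 10 s waiting to read,
        # instead of blocking forever (values are illustrative).
        res = requests.get(url1, proxies=proxyDict, timeout=(3, 10))
        res.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP errors alike.
        print('Skipping ' + url1 + ': ' + str(e))
        return ""  # empty text lets the caller move on to the next URL
    soup = BeautifulSoup(res.text, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))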
