I've written a script in Python to scrape the results that appear after filling in the two input boxes, zip code and distance, with 66109 and 10000. When I enter those values manually, the site displays results, but when I try the same thing with the script I get nothing. The script throws no error either. What might be the issue here?
Website link: https://www.sart.org/clinic-pages/find-a-clinic/
I've tried with:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    session = requests.Session()
    response = session.post(link, data=payload, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

if __name__ == '__main__':
    get_clinics(url)
I'm only after the line "Within 10000 miles of 66109 there are 383 clinics." that is generated when the search is made.
I changed the URL and the request method to GET, and it worked for me:
def get_clinics(link):
    session = requests.Session()
    response = session.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.text, "lxml")
    item = soup.select_one(".clinics__search-meta").text
    print(item)

url = 'https://www.sart.org/clinic-pages/find-a-clinic?zip=66109&strdistance=10000&SelectedState=Select+State+or+Region'
get_clinics(url)
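A variation on the same idea (my own sketch, not part of the original answer) is to keep the base URL and let requests build the query string from the payload dict defined in the question:

def get_clinics(link, payload):
    # Same GET approach; requests encodes the payload dict into the query string
    with requests.Session() as session:
        response = session.get(link, params=payload, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(response.text, "lxml")
        print(soup.select_one(".clinics__search-meta").get_text(strip=True))

get_clinics('https://www.sart.org/clinic-pages/find-a-clinic/', payload)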
Including cookies is one of the main concerns here. If you do it the right way, you can get a valid response with the approach you started with. Here is the working code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.sart.org/clinic-pages/find-a-clinic/'

payload = {
    'zip': '66109',
    'strdistance': '10000',
    'SelectedState': 'Select State or Region'
}

def get_clinics(link):
    with requests.Session() as s:
        res = s.get(link)
        req = s.post(link, data=payload, cookies=res.cookies.get_dict())
        soup = BeautifulSoup(req.text, "lxml")
        item = soup.select_one(".clinics__search-meta").get_text(strip=True)
        print(item)

if __name__ == '__main__':
    get_clinics(url)
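As a side note (my own observation, not something the answer states): a requests.Session already carries the cookies from the first GET into the later POST, so the explicit cookies= argument can usually be dropped:

def get_clinics(link):
    # The Session stores the cookies from the GET and sends them with the POST automatically
    with requests.Session() as s:
        s.get(link)
        res = s.post(link, data=payload)
        soup = BeautifulSoup(res.text, "lxml")
        print(soup.select_one(".clinics__search-meta").get_text(strip=True))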
I am trying to obtain both the title and the link of each search result on the first page, in some form of list or dictionary. However, my output shows Element 'a' href= before the link, and my title shows Element 'h3' class=('LC20lb', 'MBeuO', 'DKV0Md') instead of the actual title. I have read through many examples, but most rely on API subscriptions, which I'd rather avoid, and the BeautifulSoup approaches I have found don't work either. This is the furthest I've got with this project so far. My code is below:
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession

def get_source(url):
    """Return the source code for the provided URL.

    Args:
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html.
    """
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print(e)

def scrape_google(query):
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)

    links = list(response.html.absolute_links)
    google_domains = ('https://www.google.',
                      'https://google.',
                      'https://webcache.googleusercontent.',
                      'http://webcache.googleusercontent.',
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')

    for url in links[:]:
        if url.startswith(google_domains):
            links.remove(url)

    return links

def get_results(query):
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)
    return response

def parse_results(response):
    css_identifier_result = ".MjjYud"
    css_identifier_title = "h3.LC20lb.MBeuO.DKV0Md"
    css_identifier_link = ".yuRUbf a"

    results = response.html.find(css_identifier_result)

    output = []
    for result in results:
        item = {
            'title': result.find(css_identifier_title, first=True),
            'link': result.find(css_identifier_link, first=True)
        }
        output.append(item)

    return output

def google_search(query):
    response = get_results(query)
    return parse_results(response)

results = google_search("Elon Musk twitter")
results
Output:
[{'title': <Element 'h3' class=('LC20lb', 'MBeuO', 'DKV0Md')>,
'link': <Element 'a' href='https://www.theguardian.com/technology/2022/oct/30/twitter-trolls-bombard-platform-after-elon-musk-takeover' data-jsarwt='1' data-usg='AOvVaw3lmUr0p6yL70Nhr1Y5jurH' data-ved='2ahUKEwjA9-TFw4j7AhX3QzABHVyyBwcQFnoECBgQAQ'>}]
You can use the BeautifulSoup library to parse your results better.
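For example, a minimal sketch of what that could look like (the CSS selectors are taken from the question; the parsing itself is my own illustration, not something the original answer spells out). The key difference from the question's code is that .get_text() and the href attribute return plain strings instead of Element objects:

from bs4 import BeautifulSoup

def parse_results_bs4(response):
    # Parse the raw HTML with BeautifulSoup instead of requests_html Element objects
    soup = BeautifulSoup(response.text, "html.parser")
    output = []
    for result in soup.select(".MjjYud"):
        title = result.select_one("h3.LC20lb.MBeuO.DKV0Md")
        link = result.select_one(".yuRUbf a")
        if title and link:
            output.append({
                'title': title.get_text(strip=True),  # plain string instead of an Element
                'link': link["href"],                 # href attribute as a string
            })
    return output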
I'm trying to get the destination of a bunch of t.co links from Twitter. I can get this for active links, but when they are 404 or dead links, the program dies. If I enter this into the browser, it shows me the destination URL.
Is there a way to do this in Python 3?
This is my existing code:
import requests
import json
import pandas as pd
from requests.models import Response

# Loading my array of links
data = pd.read_json('tco-links.json')
links = pd.DataFrame(data)

output = []
session = requests.Session()  # so connections are recycled

with open('output.json', 'w') as f:
    for index, row in links.iterrows():
        fullLink = 'http://' + row['link']
        try:
            response = session.head(fullLink, allow_redirects=True)
        except:
            # how I'm handling errors right now
            response = Response()
            response.url = 'Failed'
        output.append({
            'link': fullLink,
            'id': row['id'],
            'unshortened': response.url
        })
    for x in output:
        f.write(json.dumps(x) + '\n')
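One thing worth noting (my own reading of requests' behaviour, not something stated in the question): a 404 at the destination does not raise an exception, so response.url still holds the unshortened target; exceptions only come from network-level failures such as dead domains or timeouts. A sketch of how the lookup could be isolated with a narrower except clause:

def unshorten(session, full_link):
    """Follow redirects and return the final URL, or 'Failed' for truly dead links."""
    try:
        # requests does not raise on a 404 response; it only raises on
        # network-level problems, so response.url is still the redirect
        # target even when the destination page is gone.
        response = session.head(full_link, allow_redirects=True, timeout=10)
        return response.url
    except requests.exceptions.RequestException:
        return 'Failed'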
G'day guys, I'm working on a Python project that pulls weather data from BOM (https://bom.gov.au).
The script works correctly; however, I would like it to be able to use part of the URL within the POST request, i.e. when the user navigates to https://example.com/taf/ymml, the script runs and uses YMML within the POST.
The script I am using is below. I would like to swap out 'YSSY' in myobj for something that pulls it from the URL the user navigates to.
import requests
import re

url = 'http://www.bom.gov.au/aviation/php/process.php'
myobj = {'keyword': 'YSSY', 'type': 'search', 'page': 'TAF'}
headers = {'User-Agent': 'Chrome/102.0.0.0'}

x = requests.post(url, data=myobj, headers=headers)
content = x.text

stripped = re.sub('<[^<]+?>', ' ', content)
split_string = stripped.split("METAR", 1)
substring = split_string[0]
print(substring)
Any ideas?
OK, so I've managed to get this working using FastAPI. When a user navigates to example.com/taf/ymml, the site returns the TAF for YMML in plain text; YMML can be substituted for any Australian aerodrome. One thing I haven't figured out is how to remove the square brackets around the TAF, but that is a problem for another time.
from fastapi import FastAPI
import requests
from bs4 import BeautifulSoup

app = FastAPI()

@app.get("/taf/{icao}")
async def read_icao(icao):
    url = 'http://www.bom.gov.au/aviation/php/process.php'
    myobj = {'keyword': icao, 'type': 'search', 'page': 'TAF'}
    headers = {'User-Agent': 'Chrome/102.0.0.0'}

    x = requests.post(url, data=myobj, headers=headers)
    content = x.text

    split_string = content.split("METAR", 1)
    substring = split_string[0]

    soup = BeautifulSoup(substring, 'html.parser')
    for br in soup('br'):
        br.replace_with(' ')

    # Create TAFs array.
    tafs = []
    for taf in soup.find_all('p', class_="product"):
        full_taf = taf.get_text()
        tafs.append(full_taf.rstrip())

    return {tuple(tafs)}
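On the bracket issue: my guess (not confirmed above) is that the brackets come from returning a set containing a tuple, which FastAPI serializes as nested JSON arrays. Returning a plain string instead would avoid them, for example by replacing the last line with:

    # Hypothetical variation: join the TAFs into one string so FastAPI returns a
    # plain JSON string rather than nested arrays (the source of the brackets).
    return " ".join(tafs)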
I'm looking for a way to optimize my Python code for a multithreaded application.
My code works in the following way: it downloads the sitemap, gathering all links from it into map_links. After that, the parser function visits every link found and extracts data from the tag I need.
import threading
import requests
from bs4 import BeautifulSoup as bs

headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

base_url = 'https://example.com/page.php'
sitemap_url = 'https://example.com/sitemap.xml'  # https://exgfmeetworld.xyz/sitemap.xml

# sitemap parsing function
def sitemap(sitemap_url, headers):
    map_links = []
    session = requests.Session()
    request = session.get(sitemap_url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'xml')
        for links in soup.find_all('loc'):
            map_links.append(links.text)
    return map_links

# main parsing function
def parser(base_url, headers):
    keyword = None
    session = requests.Session()
    request = session.get(base_url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'html.parser')
        # keyword = soup.find_all('h1', attrs={'class':'vedaky'})
        keyword = soup.select('h1')[0].get_text()
    else:
        print('error')
    return keyword

# main parsing function
def main():
    all_links = sitemap(sitemap_url, headers)
    for i in all_links:
        keyword_pars = parser(i, headers)
        print(keyword_pars)

if __name__ == '__main__':
    main()
I have tried "from multiprocessing import Pool", but it doesn't work for my purpose. I need a non-pool solution because I need much higher performance from the script; I'm planning to run it with more than 20 threads.
I can't test it without the proper links, but I think this will do what you want. It works by mapping the parser function over the list of links with a thread pool and collecting each result into a list.
Note that I haven't added the much-needed error handling.
import threading
import requests
from bs4 import BeautifulSoup as bs
from multiprocessing.dummy import Pool as ThreadPool

SITE_MAP_URL = 'https://exgfmeetworld.xyz/sitemap.xml'
BASE_URL = 'https://example.com/page.php'
HEADERS = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}

def get_site_map() -> list:
    request = requests.get(SITE_MAP_URL, headers=HEADERS, timeout=5)
    links = []
    if request.status_code == 200:
        soup = bs(request.content, "html.parser")
        links = [l.text for l in soup.find_all("loc")]
    return links

def parser(link: str):
    request = requests.get(link, headers=HEADERS, timeout=5)
    if request.status_code == 200:
        soup = bs(request.content, "html.parser")
        return soup.find("h1").text
    return None

# - MAIN
links = get_site_map()
parser_output = []

pool = ThreadPool(20)
results = pool.map(parser, links)
pool.close()
pool.join()

print(results)
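Since the question explicitly asked for a non-pool approach, here is a minimal sketch of the same idea with plain threading.Thread workers pulling from a queue (my own illustration reusing the parser function above, not part of the original answer):

import threading
from queue import Queue, Empty

def worker(link_queue, results, lock):
    # Each worker drains links from the shared queue until it is empty.
    while True:
        try:
            link = link_queue.get_nowait()
        except Empty:
            return
        keyword = parser(link)  # reuses parser() from the code above
        with lock:
            results.append((link, keyword))
        link_queue.task_done()

def run_threads(links, num_threads=20):
    link_queue = Queue()
    for link in links:
        link_queue.put(link)
    results = []
    lock = threading.Lock()
    threads = [threading.Thread(target=worker, args=(link_queue, results, lock))
               for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results

print(run_threads(get_site_map()))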
I want to scrape data from a website, but first I want to get the pages via pagination. I'm using Python, and I already have the code below, but when I run it, it doesn't work properly: the loop should stop when response.url no longer matches expected_url. Does anyone know how to solve this? Please help, thank you.
Here is the code :
from bs4 import BeautifulSoup
import urllib.request

count = 0
url = "http://www.belanjamimo.net/foundation-bb-cream/?o=a&s=%d"

def get_url(url):
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)

expected_url = url % count
response = get_url(expected_url)

while (response.url == expected_url):
    print("GET {0}".format(expected_url))
    count += 9
    expected_url = url % count
    response = get_url(expected_url)
Try the approach below to exhaust all the items across the different pages and break out of the loop when no more items are available.
from bs4 import BeautifulSoup
import requests

url = "http://www.belanjamimo.net/foundation-bb-cream/?o=a&s={}"

page = 0
while True:
    res = requests.get(url.format(page))
    soup = BeautifulSoup(res.text, "lxml")
    items = soup.select(".product-block h2 a")
    if len(items) <= 1: break  # check whether any products are still available
    for item in items:
        print(item.text)
    page += 9