While trying to get some house information from this site (https://cd.lianjia.com/ershoufang/106101326994.html), I had a problem getting the "contents" of the element shown as '<span> contents </span> == $0' with the beautifulsoup4 module: I always got '0' instead of the contents. (enter image description here) Thanks a lot!
here is my code:
import requests
from bs4 import BeautifulSoup
from Headers import headers
def getSigleHouseDetail(houseurl):
    """Fetch a listing page and return its total-count text.

    Downloads ``houseurl``, decodes the body as UTF-8 and parses it with
    the built-in ``html.parser``.

    Returns a dict whose ``'totalcount'`` key holds the text of the first
    ``<span>`` inside the first element matching ``.totalCount``.

    Raises ``IndexError`` when either selector matches nothing.
    """
    response = requests.get(houseurl)
    # Set the encoding before reading ``.text`` so the body is decoded as UTF-8.
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    total_block = page.select('.totalCount')[0]
    first_span = total_block.select('span')[0]
    return {'totalcount': first_span.text}
url = 'https://cd.lianjia.com/ershoufang/106101326994.html'
print(getSigleHouseDetail(url)['totalcount'])
What you are doing now is printing the index of the object you create on this line:
result['totalcount'] = soup.select('.totalCount')[0].select('span')[0].text
Instead, you should capture the content by using attributes such as class, id, and others:
import requests
from bs4 import BeautifulSoup
def getSigleHouseDetail(houseurl):
    """Return the text of the first ``<span class="className">`` on a page.

    ``className`` is presumably a placeholder; substitute the class of the
    span you actually want.

    Raises ``IndexError`` when no matching span exists.
    """
    res = requests.get(houseurl)
    # ``from_encoding`` only applies to byte input. BeautifulSoup ignores it
    # (with a warning) when handed an already-decoded ``str`` such as
    # ``res.text``, so pass the raw bytes instead.
    soup = BeautifulSoup(res.content, 'html.parser', from_encoding='utf-8')
    method_divs = soup.body.find_all('span', attrs={'class': 'className'})
    return method_divs[0].text
url = 'https://cd.lianjia.com/ershoufang/106101326994.html'
print(getSigleHouseDetail(url))
The line :
return method_divs[0].text
will return the text of the first span with the class className, which the caller then prints.
Thanks for all your answers. I found that the contents in the element '<span> contents </span> == $0' can be found in JavaScript data.
Related
I'm trying to remove the extra space and "rebtel.bootstrappedData" in the second paragraph, but for some reason it won't work.
This is my output
"welcome_offer_cuba.block_1_title":"SaveonrechargetoCuba","welcome_offer_cuba.block_1_cta":"Sendrecharge!","welcome_offer_cuba.block_1_cta_prebook":"Pre-bookRecarga","welcome_offer_cuba.block_1_footprint":"Offervalidfornewusersonly.","welcome_offer_cuba.block_2_key":"","welcome_offer_cuba.block_2_title":"Howtosendarecharge?","welcome_offer_cuba.block_2_content":"<ol><li>Simplyenterthenumberyou’dliketosendrechargeinthefieldabove.</li><li>Clickthe“{{buttonText}}”button.</li><li>CreateaRebtelaccountifyouhaven’talready.</li><li>Done!Yourfriendshouldreceivetherechargeshortly.</li></ol>","welcome_offer_cuba.block_3_title":"DownloadtheRebtelapp!","welcome_offer_cuba.block_3_content":"Sendno-feerechargeandenjoythebestcallingratestoCubainoneplace."},"canonical":{"string":"<linkrel=\"canonical\"href=\"https://www.rebtel.com/en/rates/\"/>"}};
rebtel.bootstrappedData={"links":{"summary":{"collection":"country_links","ids":[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],"params":{"locale":"en"},"meta":{}},"data":[{"title":"A","links":[{"iso2":"AF","route":"afghanistan","name":"Afghanistan","url":"/en/rates/afghanistan/","callingCardsUrl":"/en/calling-cards/afghanistan/","popular":false},{"iso2":"AL","route":"albania","name":"Albania","url":"/en/rates/albania/
And this is the code I used:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.rebtel.com/en/rates/"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
# Take the text of the fifth <script> element, trim surrounding whitespace
# and drop its first 38 characters before printing.
script = soup.find_all("script")[4].text.strip()[38:]
print(script)
What should I add to "script" so it will remove the empty spaces?
Original answer
You can change the definition of your script variable by :
script = soup.find_all("script")[4].text.replace("\t", "")[38:]
It will remove all tab characters from your text, and with them the indented paragraphs.
Edit after conversation in the comments
You can use the following code to extract the data in json :
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.rebtel.com/en/rates/"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
# Strip tab characters from the fifth <script> element and keep only its
# non-empty lines.
raw_script = soup.find_all("script")[4].text.replace("\t", "")
script = [line for line in raw_script.split("\r\n") if line]
# Second line: remove the "rebtel.appData = " prefix and the final character
# before decoding the JSON.
app_payload = script[1].replace("rebtel.appData = ", "")[:-1]
app_data = json.loads(app_payload)
# Third line: remove the "rebtel.bootstrappedData = " prefix and decode.
bootstrapped_payload = script[2].replace("rebtel.bootstrappedData = ", "")
bootstrapped_data = json.loads(bootstrapped_payload)
I extracted the lines of the script with split("\r\n") and got the wanted data from there.
I'm trying to scrape some links from a site, but I'm running into an issue where my for loop stops at the first link.
Currently What I have
import requests
import lxml
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
# Collect the href of every anchor inside the first <td> carrying the
# 'b-statistics__table-col' class.
first_cell = soup.find('td', {'class': 'b-statistics__table-col'})
fighter_links = first_cell.find_all('a')
fighterLinks = [anchor['href'] for anchor in fighter_links]
print(fighterLinks)
When I print I'm getting
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9']
Site I'm trying to pull from
when you do
fighter_links = soup.find('td', {'class': 'b-statistics__table-col'}).find_all('a')
you are only getting the first table record: soup.find returns only the first match it finds. What you need to do is change it to:
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})
fighterLinks = []
That will get you all the table entries that match your class name; from there you need to loop over them to extract the links:
for link in fighter_links:
    # Look the anchor up once and reuse it instead of searching each cell
    # twice; cells without an anchor are skipped.
    anchor = link.find('a')
    if anchor is not None:
        fighterLinks.append(anchor.get('href'))
I don't know if this will help, but I hope it does:
import requests
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
# Select every <a> element carrying the "b-link_style_black" class.
aa = soup.select("a.b-link_style_black")
fighterLinks = []
for i in aa:
# NOTE(review): iterating over the tag `i` appears to walk its child nodes,
# so the same href is appended once per child; this would explain the
# repeated URLs in the printed output. Confirm before reusing this loop.
for k in i:
fighterLinks.append(aa[aa.index(i)].attrs["href"])
print(fighterLinks)
outputs:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 
'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a']
Requests will fail on some connections in this instance. Better use cloudscraper: (pip install cloudscraper)
import cloudscraper
from bs4 import BeautifulSoup
scraper = cloudscraper.create_scraper()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(
    scraper.get("http://ufcstats.com/statistics/fighters?char=a").text,
    "html.parser",
)
links = soup.select_one('.b-statistics__table').select('a')
# A set comprehension removes the duplicate hrefs without building a
# throwaway list first.
print({x.get('href') for x in links})
This returns:
{'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/1c5879330d42255f'}
Hi, how can I scrape text from a div without any class? At first I try to scrape all the data from the div with class 'jobs_page', and then from the divs without a class value, but it doesn't work.
from bs4 import BeautifulSoup
import requests
a = {}
def antal_pl(name=''):
    """Fetch the Accountancy job listing and print its class-less divs.

    ``name`` is accepted but not used; the URL is hard-coded.
    Any exception raised along the way is silently discarded.
    """
    try:
        html = requests.get('https://antal.pl/oferty-pracy?s=&sid=&did=Accountancy', timeout=40).text
        parsed = BeautifulSoup(html, 'lxml')
        jobs = parsed.find_all(class_='jobs_page')
        # NOTE(review): `jobs` is the collection returned by find_all, not a
        # single tag; calling find_all on it presumably fails, and the bare
        # except below hides that error. Confirm.
        print(jobs.find_all('div', class_=None))
    except:
        pass
antal_pl( name='Accontancy')
Try the below approach to get the text out of that webpage as you mentioned above. I've tried to organize your code a little to make it look cleaner.
from bs4 import BeautifulSoup
import requests
URL = "https://antal.pl/oferty-pracy?s=&sid=&did={}"
def antal_pl(name):
    """Print the stripped text of the element following the ``header`` element.

    ``name`` is substituted into the module-level ``URL`` template to build
    the address that is requested.
    """
    response = requests.get(URL.format(name))
    document = BeautifulSoup(response.text, 'lxml')
    header = document.find(class_='header')
    summary = header.find_next_sibling()
    print(summary.text.strip())
if __name__ == '__main__':
antal_pl("Accountancy")
Result:
Znaleziono 47 ofert pracy.
Use XPath:
# NOTE(review): assumes `etree` is lxml's etree module and `wb_data` holds the
# page's HTML text; neither is defined in this fragment. Confirm.
html = etree.HTML(wb_data)
# Select the <a> elements at the absolute path /html/body/div/ul/li/a.
html_data = html.xpath('/html/body/div/ul/li/a')
enter image description here
Here is the URL that I am using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
In fact, on this page the link that I am looking for appears maybe 5 seconds after the page loads.
After those 5 seconds I see a POST request to:
http://www.protect-stream.com/secur.php
with data like so :
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I don't understand where the 'k' value comes from.
Is there any way to get the 'k' value using Python?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
base_url = "http://www.protect-stream.com"


def _get_soup(session, url):
    """Fetch ``url`` through ``session`` and return the parsed document."""
    response = session.get(url)
    return BeautifulSoup(response.content, "html.parser")


def _iframe_url(soup, src_prefix):
    """Return the absolute URL of the first iframe whose ``src`` starts with ``src_prefix``."""
    src = soup.select_one('iframe[src^="{}"]'.format(src_prefix))["src"]
    return urljoin(base_url, src)


with requests.Session() as session:
    soup = _get_soup(session, "http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")
    # get the top frame url
    frame_url = _iframe_url(soup, "frame.php")
    # get the nested frame url
    soup = _get_soup(session, frame_url)
    frame_url = _iframe_url(soup, "w.php")
    # get the frame HTML source and extract the "k" value
    soup = _get_soup(session, frame_url)
    # ``string=`` is the current spelling of the filter formerly named
    # ``text=`` (available since Beautiful Soup 4.4.0).
    script = soup.find("script", string=lambda s: s and "k=" in s).get_text(strip=True)
    k_value = re.search(r'var k="(.*?)";', script).group(1)
    print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,
My code:
from urllib2 import urlopen
from bs4 import BeautifulSoup
url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
links = soup.find_all('a', href = True)
files = []
def page_names():
# Append the href of every anchor in the module-level `links` to `files`.
for a in links:
files.append(a['href'])
# NOTE(review): indentation was lost in this listing; `return files`
# presumably belongs after the loop rather than inside it. Confirm.
return files
page_names()
print files[:]
base = "https://realpython.com/practice/"
print base + files[:]
I'm trying to parse out three webpage file names and append them to "files" list, then somehow append or add them to the end of the base url for a simple print.
I've tried making "base" a single item list so I could append, but I am rather new to Python and believe I'm screwing up my for statement.
Currently I get:
print files[:]
TypeError: 'type' object has no attribute '__getitem__'
At the end you wrote list[:], which is wrong because list is the built-in type used for creating lists, not your own variable.
from urllib2 import urlopen
from bs4 import BeautifulSoup
url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
soup = BeautifulSoup(html_text)
links = soup.find_all('a', href = True)
files = []
def page_names():
    """Append the ``href`` of every anchor in ``links`` to ``files``."""
    files.extend(a['href'] for a in links)
page_names()
base = "https://realpython.com/practice/"
for i in files:
print base + i
Output:
https://realpython.com/practice/aphrodite.html
https://realpython.com/practice/poseidon.html
https://realpython.com/practice/dionysus.html
You also don't need an intermediate list for storing the links or files; just use a list comprehension:
from urllib2 import urlopen
from bs4 import BeautifulSoup
url = "https://realpython.com/practice/profiles.html"
html_page = urlopen(url)
html_text = html_page.read()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html_text, "html.parser")
# Only anchors that actually carry an href attribute are collected.
files = [i['href'] for i in soup.find_all('a', href=True)]
base = "https://realpython.com/practice/"
for i in files:
    # A parenthesised single-argument print runs under both Python 2 and 3.
    print(base + i)