Im trying to open a file using python,for storing the article extracted from url.
import urllib.request
from bs4 import BeautifulSoup
url = "http://www.thedrum.com/news/2015/07/29/mankind-must-get-ahead-technical-development-states-phds-mark-holden-following"
openfile = urlopen(url)
soup = BeautifulSoup(openfile)
f = open("test1.txt", "w")
for data in soup.find_all("p"):
sum1 = data.get_text()
f.writelines(sum1)
f.close()
But it shows the TypeError: 'HTTPResponse' object is not callablepointing to f = open("test1.txt", "w")
How do I solve this?
You need to make the soup object from the html content (that you get from the read function), not the response object
with urllib.request.urlopen(url) as r, open("test1.txt", "w") as f:
soup = BeautifulSoup(r.read().decode('utf-8'))
for data in soup.find_all("p"):
sum1 = data.get_text()
f.writelines(sum1)
Related
I hope you're well. Could you please tell why I can use my scraping script properly :)
It works with other website. I'm a beginner so I probably made a basic mistake
import requests
from bs4 import BeautifulSoup
import time
import csv
links = []
for i in range(1):
url = '*******/recettes/?page={}' + str(i)
res = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'})
response = requests.get(url)
print(response)
if response.ok:
print('Page: ' + str(i))
soup = BeautifulSoup(response.text, "html.parser")
divs = soup.findAll('div', class_ = 'field-item even')
for div in divs:
a = div.find('a')
link = a['href']
links.append('*******' + link)
time.sleep(3)
print(len(links))
with open('urls3.txt', 'w') as file:
for link in links:
file.write(link + '\n')
"""
with open('urls3.txt', 'r') as inf:
with open('recipes3.csv', 'w') as outf:
outf.write('titre,image,url,\n')
for row in inf:
url = row.strip()
response = requests.get(url)
if response.ok:
soup = BeautifulSoup(response.text, "html.parser")
titre = soup.find('h1')
image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
print(titre.text, image, url)
outf.write(str(titre.text) + ',' + str(image) + ',' + str(url) + '\n')
time.sleep(1)
"""
Could you please tell me why there is a mistake here :
<Response [200]>
Page: 0
Traceback (most recent call last):
File "ex3.py", line 18, in <module>
link = a['href']
TypeError: 'NoneType' object is not subscriptable
I've found the answer, I post it here :) for anyone interested
try:
image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
except Exception as e:
image = None
I am trying to download pdf report from web using python, however the code is returning a blank pdf report at the end, may i know whats wrong with the code and where i am going wrong.
=============================================
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen("http://www.imd.gov.in/Welcome%20To%20IMD/Welcome.php")
soup = BeautifulSoup(html_page)
b = soup.findAll('a', attrs={'href': re.compile("^http://hydro.imd.gov.in/hydrometweb/")})
c = b[0]['href']
d = c[0:len(c)-12]
e = d + "PdfReportPage.aspx?ImgUrl=PRODUCTS/Rainfall_Statistics/Cumulative/District_RF_Distribution/DISTRICT_RAINFALL_DISTRIBUTION_COUNTRY_INDIA_cd.PDF"
def download_file(download_url):
response = urllib2.urlopen(download_url)
file = open("document.pdf", 'w')
file.write(response.read())
file.close()
print("Completed")
download_file(e)
Use the binary mode b
Ex:
def download_file(download_url):
response = urllib2.urlopen(download_url)
with open("document.pdf", 'wb') as outfile:
outfile.write(response.read())
print("Completed")
download_file(e)
I need some help with the following code:
import csv
import requests
from bs4 import BeautifulSoup
import datetime
filename = "imob_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")+".csv"
with open(filename, "w+") as f:
writer = csv.writer(f)
writer.writerow(["Localizare","Pret","Data"])
for i in range(1,100):
r = requests.get("https://www.imobiliare.ro/inchirieri-case-vile/brasov?pagina="+format(i))
soup = BeautifulSoup(r.text, "html.parser")
array_price= soup.find_all('div', class_='pret')
array_desc=soup.find_all('h2', class_='titlu-anunt hidden-xs',text=True)
for iterator in range(0,len(array_price)):
localizare = array_desc[iterator].text.strip()
pret = array_price[iterator].text.strip()
writer.writerow([localizare, pret, datetime.datetime.now()])
The output is empty. Can someone give me an advice, please? Thank you.
You had a couple of issues:
First as stated in the comments the class price does not exist. You could use pret but it's easier to use soup.find_all('span', class_="pret-mare")
Second array_desc=soup.find_all('h2', class_='titlu-anunt hidden-xs',text=True) is returning empty. I removed text=True and it started working.
import csv
import requests
from bs4 import BeautifulSoup
import datetime
filename = "imob_" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")+".csv"
with open(filename, "w+") as f:
writer = csv.writer(f)
writer.writerow(["Localizare","Pret","Data"])
for i in range(1,100):
r = requests.get("https://www.imobiliare.ro/inchirieri-case-vile/brasov?pagina="+format(i))
soup = BeautifulSoup(r.text, "html.parser")
array_price = soup.find_all('span', class_="pret-mare")
array_desc=soup.find_all('h2', class_='titlu-anunt hidden-xs')
for iterator in range(0,len(array_price)):
localizare = array_desc[iterator].text.strip()
pret = array_price[iterator].text.strip()
writer.writerow([localizare, pret, datetime.datetime.now()])
#coding:utf-8
import requests
from bs4 import BeautifulSoup
url = 'http://news.qq.com/'
wbdata = requests.get(url).text
soup = BeautifulSoup(wbdata,'lxml')
news_title = soup.select("div.text > em.f14 > a.linkto")
for n in news_title:
title = n.get_text()
link = n.get("href")
data = {"标题":title,"链接":link}
print(data)
f = open('news.txt','wb')
f.write(data)
f.close()
Here are codes.
So when I run it,it gives"TypeError: a bytes-like object is required, not 'dict'",I tried many solutions,no help.
Can someone help me?
thx!
f.write(data)
This is where the problem is.
You are passing in a dictionary instead of a byte like object.
For example when I change your code to the following:
#coding:utf-8
import requests
from bs4 import BeautifulSoup
url = 'http://news.qq.com/'
wbdata = requests.get(url).text
soup = BeautifulSoup(wbdata,'lxml')
news_title = soup.select("div.text > em.f14 > a.linkto")
for n in news_title:
title = n.get_text()
link = n.get("href")
data = {"k":title,"a":link}
print(data)
f = open('news.txt','wb')
data = b'123'
f.write(data)
f.close()
... I get the following:
{'k': '辽宁舰将绕台一周“武吓”蔡英文?外交部回应', 'a': 'http://news.qq.com/a/20170104/031454.htm'} ...
Which I assume is what you want.
Alternatively change the line:
f = open('news.txt', 'wb')
to
f = open('news.txt', 'w')
and that way you can write in str rather than a byte-like object.
In any case you shouldn't be passing in a dict.
Maybe you should open the file before write the title and the link,when you write end close the file .
f = open('news.txt','wb')
for n in news_titles:
title = n.get_text()
link = n.get("href")
data= {
'标题':title,
'链接':link
}
f.write(data['标题'])
f.write(':')
f.write(data['链接'])
f.write('\r\n')
f.close()
I've been stuck trying to transfer the data that I scraped to a csv file. Here is my code:
import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')
url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text,'html.parser')
links = []
for div in soup.select('.link'):
link = div.a.get('href')
links.append(link)
for i in links:
url2 = i
res2 = requests.get(url2)
soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
for each in soup2.select('li > strong'):
data = each.text, each.next_sibling
with open('french.csv', 'wb') as file:
writer = csv.writer(file)
writer.writerows(data)
the output says:
Traceback (most recent call last):
File "test_new_project.py", line 23, in <module>
writer.writerows(data)
csv.Error: sequence expected
But I am trying to put tuples into the csv file, and as long as I know csv accepts tuples and lists. How can I fix this problem?
Atirag is correct, but you have another issue which is that your with call to open the output file is nested within a for loop. So if there is more than one link, the file will be overwritten each time and the output will not be what you expect. I think this should generate the output you intend:
for div in soup.select('.link'):
link = div.a.get('href')
links.append(link)
with open("french.csv", "w") as file:
writer = csv.writer(file)
for i in links:
res2 = requests.get(i)
soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
for each in soup2.select('li > strong'):
writer.writerow([each.text, each.next_sibling])
Change this
for each in soup2.select('li > strong'):
data = each.text, each.next_sibling
to this
data=[]
for each in soup2.select('li > strong'):
data.append((each.text, each.next_sibling))
your data variable was one tuple not a list of tuples. The above code creates a list of tuples.
Other solution is this (mind the indentation)
data = []
for i in links:
url2 = i
res2 = requests.get(url2)
soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
for each in soup2.select('li > strong'):
data.append((each.text, each.next_sibling))
with open('french.csv', 'wb') as file:
writer = csv.writer(file)
writer.writerows(data)