This is my first time posting, so apologies if there are any errors. I currently have a file with a list of URLs, and I am trying to create a Python program that will go to each URL, grab the text from the HTML page, and save it in a .txt file. I am currently using BeautifulSoup to scrape these sites, and many of them are throwing errors which I am unsure how to solve. I am looking for a better way to do this; I have posted my code below.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from urllib.request import Request
import datefinder
from dateutil.parser import parse
import json
import re
import random
import time
import scrapy
import requests
import urllib
import os.path
from os import path
#extracts page contents using BeautifulSoup
def page_extract(url):
    req = Request(url,
                  headers={'User-Agent': 'Mozilla/5.0'})
    webpage = uReq(req, timeout=5).read()
    page_soup = soup(webpage, "lxml")
    return page_soup

#opens file that contains the links
file1 = open('links.txt', 'r')
lines = file1.readlines()

#for loop that iterates through the list of urls I have
for i in range(0, len(lines)):
    fileName = str(i) + ".txt"
    url = str(lines[i])
    print(i)
    try:
        #if the scraping is successful I would like it to save the text contents in a text file
        #with the text file name being the index
        soup2 = page_extract(url)
        text = soup2.text
        f = open("Politifact Files/" + fileName, "x")
        f.write(str(text))
        f.close()
        print(url)
    except:
        #otherwise save it to another folder which contains all the sites that threw an error
        f = open("Politifact Files Not Completed/" + fileName, "x")
        f.close()
        print("NOT DONE: " + url)
Thanks @Thierry Lathuille and @Dr Pi for your responses. I was able to find a solution to this problem by looking into Python libraries that can scrape the important text off of a webpage. I came across one called Trafilatura, which is able to accomplish this task. The documentation for this library is at: https://pypi.org/project/trafilatura/.
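For reference, here is a minimal sketch of how Trafilatura could slot into the loop from the question (the links.txt input and the "Politifact Files" output folder are carried over from the original code; this is one possible arrangement, not the only one):

import trafilatura

with open('links.txt', 'r') as file1:
    lines = [line.strip() for line in file1 if line.strip()]

for i, url in enumerate(lines):
    downloaded = trafilatura.fetch_url(url)               # returns None if the fetch fails
    text = trafilatura.extract(downloaded) if downloaded else None
    if text:
        with open("Politifact Files/" + str(i) + ".txt", "w", encoding="utf-8") as f:
            f.write(text)
    else:
        print("NOT DONE: " + url)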
I wrote this code in Python to take a list of 1000+ links in a CSV file, visit each one, and extract an email address from the linked page. However, the output keeps maxing out at 1000 records even though my lists have over 1000 links. Any suggestions on how to fix this? Essentially I want to be able to use lists of upwards of 5000 links/rows.
import requests
from bs4 import BeautifulSoup
import time
import csv
from urllib.request import urlopen
import re
import sys
start = time.time()
contents = []
contents = []
with open('./SCLINKS.csv', 'r', encoding='utf-8-sig') as csvf:
    urls = csv.reader(csvf)
    for url in urls:
        contents.append(url)

for url in contents:
    try:
        html_content = requests.get(url[0]).text
        soup = BeautifulSoup(html_content, "lxml")
        # simple email pattern: word characters, @, word characters, dot, word characters
        email = re.search(r'\w+@\w+\.\w+', soup.select_one('p[itemprop="description"]').text)
        print(email.group())
    except:
        pass
print('It took', time.time()-start, 'seconds.')
sys.exit()
After running the following code, I am unable to open the downloaded PDFs. Even though the code ran successfully, the downloaded PDF files are damaged.
My computer's error message is:
Unable to open file. It may be damaged or in a format Preview doesn't recognize.
Why are they damaged and how do I solve this?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "https://github.com/sonhuytran/MIT8.01SC.2010F/tree/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual"
#If there is no such folder, the script will create one automatically
folder_location = r'/Users/rahelmizrahi/Desktop/ Physics_Solutions'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
The issue is that you are requesting the GitHub 'blob' link when you actually need the 'raw' link:
'/sonhuytran/MIT8.01SC.2010F/blob/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf'
but you want:
'/sonhuytran/MIT8.01SC.2010F/raw/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf'
So just adjust that. Full code below:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "https://github.com/sonhuytran/MIT8.01SC.2010F/tree/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual"
#If there is no such folder, the script will create one automatically
folder_location = r'/Users/rahelmizrahi/Desktop/Physics_Solutions'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    pdf_link = link['href'].replace('blob', 'raw')
    pdf_file = requests.get('https://github.com' + pdf_link)
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(pdf_file.content)
I had to use soup.select("a[href$=.pdf]") (without the inner quotes) to get it to select the links correctly.
After that, your script works, but: what you're downloading is not a PDF, but an HTML webpage! Try visiting one of the URLs: https://github.com/sonhuytran/MIT8.01SC.2010F/blob/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf
You'll be presented with a GitHub webpage, not the actual PDF. To get that, you need the "raw" GitHub URL, which you can see when you hover over the Download button: https://github.com/sonhuytran/MIT8.01SC.2010F/raw/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf
So, it looks like you just have to replace blob with raw at the proper spot to make it work:
href = link['href']
href = href.replace('/blob/', '/raw/')
requests.get(urljoin(url, href)).content
The issue is that the file is not properly closed after the open/write.
Just add f.close() at the end of the code to do that.
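For reference, a minimal sketch of the explicit open/write/close pattern this answer is describing, reusing the names from the loop above (note that a with open(...) block, as used in the question's code, already closes the file automatically when it exits):

# Explicit close instead of a with-block; url, link and filename come from the loop above.
pdf_file = requests.get(urljoin(url, link['href']))
f = open(filename, 'wb')
f.write(pdf_file.content)
f.close()  # flush and close so the bytes are fully written to disk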
I have been working on a CSV-based image scraper using BeautifulSoup. This is because the links have to be modified before downloading.
This is the basis of the code:
import requests
import csv
from bs4 import BeautifulSoup
from urllib import urlretrieve
import os
import sys

url = '..............'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

with open('output.csv', 'wb') as f:
    bsoup_writer = csv.writer(f)
    for link in soup.find_all('a', {'class': '........'}):
        bsoup_writer.writerow([link.get('href')])
This is just part of the main code, and it works very well on the page/link you're at. With that said, I would like to use another CSV file (this would be the crawling file) containing a list of links to feed to this code/py program, so it could download from each link in that CSV file. Hence, is it possible to modify the url variable to read the CSV file and iterate over the links in it?
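One possible way (a sketch, assuming Python 3 and a hypothetical one-column links.csv listing the page URLs to crawl; the class placeholder is carried over from the code above) is to wrap the scraping part in a function and call it once per row of the crawl file:

import csv
import requests
from bs4 import BeautifulSoup

def collect_hrefs(page_url, writer):
    # Scrape one page and write each matching href as a row in the output CSV.
    r = requests.get(page_url)
    soup = BeautifulSoup(r.content, 'lxml')
    for link in soup.find_all('a', {'class': '........'}):
        writer.writerow([link.get('href')])

# 'links.csv' is a hypothetical one-column file listing the pages to crawl.
with open('links.csv', 'r', newline='') as links_file, \
     open('output.csv', 'w', newline='') as out_file:
    bsoup_writer = csv.writer(out_file)
    for row in csv.reader(links_file):
        if row:  # skip blank lines
            collect_hrefs(row[0], bsoup_writer)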
I'm trying to get other subset URLs from a main URL. However, as I print to see if I get the content, I noticed that I am only getting the HTML, not the URLs within it.
import urllib.request

file = 'http://example.com'
with urllib.request.urlopen(file) as url:
    collection = url.read().decode('UTF-8')
I think this is what you are looking for.
You can use the Beautiful Soup library for Python; this code should work with Python 3.
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_all_urls(url):
    page = urlopen(url)                      # renamed from 'open' to avoid shadowing the builtin
    url_html = BeautifulSoup(page, 'html.parser')
    for link in url_html.find_all('a'):
        links = str(link.get('href'))
        if links.startswith('http'):
            print(links)
        else:
            print(url + str(links))

get_all_urls('http://url.com')               # urlopen needs a full URL including the scheme
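One caveat with the answer above: concatenating url + links only produces a valid address when the href happens to be relative to that exact URL. A slightly more robust variant (a sketch, not part of the original answer) resolves relative links with urllib.parse.urljoin:

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_all_urls(url):
    url_html = BeautifulSoup(urlopen(url), 'html.parser')
    for link in url_html.find_all('a'):
        href = link.get('href')
        if href:
            # urljoin handles absolute, root-relative and page-relative hrefs alike.
            print(urljoin(url, href))

get_all_urls('http://url.com')  # placeholder URL, as in the answer above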
I am a beginner in Python and web scraping, but I am really interested. What I want to do is extract the total number of search results per day.
If you open the page, you will see this:
Used Cars for Sale
Results 1 - 20 of 30,376
What I want is only the number 30,376. Is there any way to extract it automatically on a daily basis and save it to an Excel file? I have played around with some packages in Python, but all I got was error messages and something not relevant, like the code below:
from bs4 import BeautifulSoup
from urllib.request import urlopen
base_url = "..."
def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html, "lxml")

make_soup(base_url)
Can someone show me how to extract that particular number please? Thanks!
Here is one way, using the requests module and the soup.select function.
from bs4 import BeautifulSoup
import requests
base_url = "http://www.autotrader.co.nz/used-cars-for-sale"
def make_soup(url):
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")
    txt = soup.select('#result-header .result-count')[0].text
    print(txt.split()[-1])
make_soup(base_url)
soup.select accepts a CSS selector as its argument. The #result-header .result-count selector means: find the element having the result-count class that is inside an element with result-header as its id.
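For illustration, a tiny self-contained example of that selector on a made-up HTML fragment (the markup here is an assumption mimicking the structure described, not the real page):

from bs4 import BeautifulSoup

# Hypothetical fragment: a p.result-count nested inside div#result-header.
html = '''
<div id="result-header">
  <p class="result-count">Results 1 - 20 of 30,376</p>
</div>
'''
soup = BeautifulSoup(html, "lxml")
print(soup.select('#result-header .result-count')[0].text.split()[-1])  # prints 30,376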
from bs4 import BeautifulSoup
from urllib.request import urlopen
base_url = "http://www.autotrader.co.nz/used-cars-for-sale"
html = urlopen(base_url).read()
soup = BeautifulSoup(html, 'lxml')
result_count = soup.find(class_="result-count").text.split('of ')[-1]
print(result_count)
out:
30,376
from bs4 import BeautifulSoup
import requests, re
base_url = "http://www.autotrader.co.nz/used-cars-for-sale"
a = BeautifulSoup(requests.get(base_url).content, "lxml").select('div#result-header p.result-count')[0].text
num = re.search(r'([\w,]+)$', a)
print(int(num.group(1).replace(',', '')))
Output:
30378
It will also capture any other number that appears at the end of that statement.
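A quick illustration of that behaviour on a made-up string (the sample text here is hypothetical):

import re

sample = "Results 1 - 20 of 30,376"          # hypothetical page text
match = re.search(r'([\w,]+)$', sample)       # grabs whatever token ends the string
print(int(match.group(1).replace(',', '')))   # prints 30376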
Appending new rows to an Existing Excel File
Script to append today's date and the extracted number to existing excel file:
!!!Important!!!: Don't run this code directly on your main file. Instead, make a copy of it first and run the code on that copy. If it works properly, then you can run it on your main file. I'm not responsible if you lose your data :)
import openpyxl
import datetime
wb = openpyxl.load_workbook('/home/yusuf/Desktop/data.xlsx')
sheet = wb['Sheet1']               # get_sheet_by_name() is deprecated in current openpyxl
next_row = sheet.max_row + 1       # first empty row; openpyxl rows and columns are 1-indexed
sheet.cell(row=next_row, column=1).value = datetime.date.today()
sheet.cell(row=next_row, column=2).value = 30378  # use a variable here from the above (previous) code.
wb.save('/home/yusuf/Desktop/data.xlsx')
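As a side note, openpyxl also has a worksheet append() helper that writes a row directly below the existing data, which makes the same idea a little shorter (a sketch under the same file-path assumptions as above):

import openpyxl
import datetime

wb = openpyxl.load_workbook('/home/yusuf/Desktop/data.xlsx')
sheet = wb['Sheet1']
# append() places the values in the first row after the current data.
sheet.append([datetime.date.today(), 30378])  # replace 30378 with the extracted variable
wb.save('/home/yusuf/Desktop/data.xlsx')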