Python: Download papers from ScienceDirect by DOI with requests

I have an Excel list of DOIs of papers I'm interested in. Based on this list, I would like to download all of the papers.
I tried to do it with requests, as recommended in their documentation, but the PDF files I get are damaged: they are only a few KB in size. I have changed the chunk_size several times, from None up to 1024*1024, and I have already read many posts. Nothing helps.
What are your ideas?
import pandas as pd
import os
import requests

def get_pdf(doi, file_to_save_to):
    url = 'http://api.elsevier.com/content/article/doi:'+doi+'?view=FULL'
    headers = {
        'X-ELS-APIKEY': "keykeykeykeykeykey",
        'Accept': 'application/pdf'
    }
    r = requests.get(url, stream=True, headers=headers)
    if r.status_code == 200:
        for chunk in r.iter_content(chunk_size=1024*1024):
            file_to_save_to.write(chunk)
            return True

doi_list = pd.read_excel('list.xls')
doi_list.columns = ['DOIs']
count = 0
for doi in doi_list['DOIs']:
    doi = doi.replace('DOI:','')
    pdf = doi.replace('/','%')
    if not os.path.exists(f'path/{pdf}.pdf'):
        file = open(f'path/{pdf}.pdf', 'wb')
        get_pdf(doi, file)
        count += 1
        print(f"Dowloaded: {count} of {len(doi_list['DOIs'])} articles")

I think your problem is the return True inside the for chunk in r.iter_content loop. With that line, you'll only ever write one chunk of the PDF (at most chunk_size bytes).
You should also open files using with; as written, you never close the file handles.
import pandas as pd
import os
import requests

HEADERS = {
    'X-ELS-APIKEY': "keykeykeykeykeykey",
    'Accept': 'application/pdf'
}

def get_pdf(doi, file_to_save_to):
    url = f'http://api.elsevier.com/content/article/doi:{doi}?view=FULL'
    with requests.get(url, stream=True, headers=HEADERS) as r:
        if r.status_code == 200:
            for chunk in r.iter_content(chunk_size=1024*1024):
                file_to_save_to.write(chunk)

doi_list = pd.read_excel('list.xls')
doi_list.columns = ['DOIs']
count = 0
for doi in doi_list['DOIs']:
    doi = doi.replace('DOI:','')
    pdf = doi.replace('/','%')
    if not os.path.exists(f'path/{pdf}.pdf'):
        with open(f'path/{pdf}.pdf', 'wb') as file:
            get_pdf(doi, file)
        count += 1
        print(f"Downloaded: {count} of {len(doi_list['DOIs'])} articles")
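If the downloaded files are still only a few KB after this change, it is worth checking what the API actually returned: a small XML or JSON error body (for example, from a key without full-text entitlement) saved with a .pdf extension looks exactly like a damaged PDF. A minimal diagnostic sketch, with a placeholder DOI and API key:

import requests

# Placeholder DOI and key; inspect the response before writing it to disk
url = 'https://api.elsevier.com/content/article/doi:10.1016/j.example.2020.01.001?view=FULL'
headers = {'X-ELS-APIKEY': 'keykeykeykeykeykey', 'Accept': 'application/pdf'}
r = requests.get(url, headers=headers)
print(r.status_code, r.headers.get('Content-Type'), len(r.content))

A PDF response should report an application/pdf content type; anything else explains the tiny files.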

Related

Scraping phones and ZIPs URLs from CSV to CSV

I need to scrape a list of URLs stored in a CSV and export the results to another CSV. I must be making some mistake, because I can't get it to run, so I'd appreciate any help.
I'm very new to Python and combined several pieces of code, so I have trouble identifying where the problem is. I mixed code that imports a CSV with code that performs a string search.
import scrapy
from scrapy import Spider
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen,urlparse, Request,HTTPError
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl
The following is the code that I have so far.
phn_1 = []
zipcode_1 = []
err_msg_zipcode = []
err = []

class Spider:
    name = 'spider'
    # read csv with just url per line
    with open('urls.csv') as file:
        start_urls = [line.strip() for line in file]

    def start_request(self):
        request = Request(url = self.start_urls, callback=self.parse)
        yield request

    def parse(self, response):
        s = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        df2 = pd.DataFrame()
        phn_1 = []  # store all the extracted Phn numbers in a List
        mail_1 = []  # store all the extracted Zipcode in a List
        for line in df2.iterrows():  # Parse through each url in the list.
            try:
                try:
                    req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
                    gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # Bypass SSL certification verification
                    f = urlopen(req1, context=gcontext)
                    url_name = f.geturl()  # extract URL name
                    s = f.read()
                    phone = re.findall(r'\d{3}-\d{3}-\d{4}', s, re.MULTILINE)
                    zipcode = re.findall(r'(?<=, [A-Z]{2} )\d{5}', s, re.MULTILINE)
                    if len(phone) == 0:
                        print("No phone number found.")
                        err_msg_phn = "No phone number found."
                        phn_1.append((url_name, err_msg_phn))
                    else:
                        count = 1
                        for item in phone:
                            phn_1.append((url_name, item))
                            count += 1
                        print(phn_1)
                    if len(zipcode) == 0:
                        print("No zipcode found.")
                        err_msg_zipcode = "No zipcode address found."
                        zipcode_1.append((url_name, err_msg_zipcode))
                    else:
                        count = 1
                        for item in zipcode:
                            mail_1.append((url_name, item))
                            count += 1
                        print(mail_1)
                except BadStatusLine:  # Catch if invalid url names exist
                    print("could not fetch %s" % url_name)
            except urllib3.request.HTTPError as err:  # catch HTTP 404 not found error
                if err == 404:
                    print("Received HTTPError on %s" % url_name)

df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()
df_p = pd.DataFrame(phn_1, columns=['URL','Phone_No'])  # Dataframe for url and Phn number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first')  # remove duplicates
df_m = pd.DataFrame(zipcode_1, columns=['URL','Zipcode'])  # Dataframe for url and Zipcode
df_mail = df_m.drop_duplicates(subset=['URL','Zipcode'], keep='first')  # remove duplicates
df_final = pd.merge(df_phn, df_mail, on='URL', how='inner')  # Merge two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False, encoding='utf-8')

#convert the csv output to json
with open('result_contact.csv') as f:
    reader = csv.DictReader(f)
    rows = list(reader)
Thank you!!!
One obvious mistake I see is here:
request = Request(url = self.start_urls, callback=self.parse)
url should be a string, but you are sending a list. If you want to send multiple requests, you need to use a loop. As you are already setting start_urls and using the parse callback, you do not need to override start_requests. The default implementation should take care of it.
You may want to consider setting the start_urls in the __init__ method.
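For illustration, a minimal sketch of a spider that loads start_urls in __init__ and relies on the default start_requests; the urls.csv name comes from the question, while the parsed output fields here are just placeholders, not the phone/zipcode logic itself:

import scrapy

class ContactSpider(scrapy.Spider):
    name = 'spider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # one URL per line; the default start_requests() will issue a request
        # for each entry in start_urls and pass each response to parse()
        with open('urls.csv') as f:
            self.start_urls = [line.strip() for line in f if line.strip()]

    def parse(self, response):
        # placeholder extraction; replace with the phone/zipcode parsing
        yield {'url': response.url, 'length': len(response.body)}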

How can we read JSON data from URL, convert to dataframe, and save as CSV

I'm playing around with some code to read JSON encoded data from a URL, push it into a data frame and save the results to a CSV. The code that I attempted to run is shown below. I think this is pretty close, but something is wrong, because nothing gets downloaded.
import urllib
from urllib.request import urlopen
import json
import pandas as pd
from pandas.io.json import json_normalize
all_links = ['https://www.baptisthealthsystem.com/docs/global/standard-charges/474131755_abrazomaranahospital_standardcharges.json?sfvrsn=9a27928_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621861138_abrazocavecreekhospital_standardcharges.json?sfvrsn=674fd6f_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621809851_abrazomesahospital_standardcharges.json?sfvrsn=13953222_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621811285_abrazosurprisehospital_standardcharges.json?sfvrsn=c8113dcf_2']
for item in all_links:
#print(item)
try:
length = len(item)
first_under = item.find('_') + 1
last_under = item.rfind('?') - 21
file_name = item[first_under:last_under]
print(file_name)
# store the response of URL
response = urlopen(item)
data = json.loads(response.read())
#print(type(data))
data = json.loads(item.read().decode())
df = pd.DataFrame(json_normalize(data, 'metrics'), encoding='mac_roman')
DOWNLOAD_PATH = 'C:\\Users\\ryans\\Desktop\\hospital_data\\' + file_name + '.csv'
urllib.request.urlretrieve(df,DOWNLOAD_PATH)
except Exception as e: print(e)
Any thoughts on what could be wrong here?
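Two things stand out in the loop: the body is parsed a second time with item.read() (item is a URL string, which has no read method), and urllib.request.urlretrieve expects a URL, not a DataFrame; DataFrame.to_csv writes the file directly. A minimal sketch of the intended flow, assuming the JSON really does contain a 'metrics' record path and using pandas' top-level json_normalize:

import json
from urllib.request import urlopen

import pandas as pd

def save_metrics_csv(url, out_dir='C:\\Users\\ryans\\Desktop\\hospital_data'):
    # derive the output name the same way the question does
    file_name = url[url.find('_') + 1:url.rfind('?') - 21]
    with urlopen(url) as response:
        data = json.loads(response.read())      # parse the HTTP body once
    df = pd.json_normalize(data, 'metrics')     # flatten the 'metrics' records
    df.to_csv(f'{out_dir}\\{file_name}.csv', index=False)

# e.g. for item in all_links: save_metrics_csv(item)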

How to write all requests to files?

Imagine you have the following list:
listfiles = ['https://yourubl.nl/wp-content/uploads/elementor/forms/60916b7e4f600.pdf',
             'https://yourubl.nl/wp-content/uploads/elementor/forms/60916d04e0d70.pdf',
             'https://yourubl.nl/wp-content/uploads/elementor/forms/60917c5a5c95f.pdf']
# here we are importing the library
import requests
from requests.auth import HTTPBasicAuth
# making the request
listfiles = []
for url in url_list:
    response = requests.get(url, auth=HTTPBasicAuth('c101838', 'HQSRynw9'))
    listfiles.append(response)
print(listfiles)

for files in listfiles:
    for kvk in kvk_list:
        with open(kvk + '.pdf', 'wb') as f:
            f.write(files.content)
Now I want to write each of the responses to a pdf file.
kvk_list = ['88888888', '9999999', '4444444']
url_list = [<Response [200]>, <Response [200]>, <Response [200]>]
However, I'm only getting the last pdf file in all three outputs....
How is this possible?
Please help!
Use a zip() function to tie the kvk_list and listfiles together:
for kvk, files in zip(kvk_list, listfiles):
    with open(kvk + '.pdf', 'wb') as f:
        f.write(files.content)
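zip() pairs the items positionally and stops at the shorter of the two lists, so each response is written exactly once under its own kvk number. A combined sketch of the whole flow, using the URLs and placeholder credentials from the question:

import requests
from requests.auth import HTTPBasicAuth

kvk_list = ['88888888', '9999999', '4444444']
url_list = ['https://yourubl.nl/wp-content/uploads/elementor/forms/60916b7e4f600.pdf',
            'https://yourubl.nl/wp-content/uploads/elementor/forms/60916d04e0d70.pdf',
            'https://yourubl.nl/wp-content/uploads/elementor/forms/60917c5a5c95f.pdf']

for kvk, url in zip(kvk_list, url_list):
    response = requests.get(url, auth=HTTPBasicAuth('c101838', 'HQSRynw9'))
    with open(kvk + '.pdf', 'wb') as f:   # one output file per kvk number
        f.write(response.content)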

Python : download image from url but receiving HTTP Error 404

I'm trying to download an image from a website, but I get a 404 error. I tried adding a user agent, with no success.
Here is the code:
import requests
import shutil
with open(r'C:\Users\home\Desktop\urls.csv') as file:
    csv = []
    for row in file:
        csv.append(row.split(";"))

row = 0
while row < len(csv):
    r = requests.get(csv[row][0], stream=True, headers={'User-agent': 'Mozilla/5.0'})
    if r.status_code == 200:
        with open(r"C:\Users\home\Desktop\images\house" + str(row) + ".jpg", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    row += 1
The url is:
https://example.com/wp-content/uploads/2018/10/toronto-curbed-8.jpg
replace example by cdn.icepop

Copy cell images from Smartsheet using Python

I am trying to make a copy of Smartsheet data on my local disk. I am able to copy all of the Smartsheet data except for the cell images. Below is the code I am using; it works perfectly fine for copying the data, but not the cell images.
NOTE: I am not trying to copy the attachments from the sheets, only the cell images and data.
Could someone help me enhance this code to copy the cell images as well?
import json
import os
import requests
import time
token = "Bearer <TOken>"
backed_up_sheets = {"Attach": 86960044478894, "test2": 6659760455684}
dir = r'C:\Users\\me\SmartSheetsBackup\WorkSheet' + time.strftime("-%m_%d_%Y_%H_%M")
API_URL = "https://api.smartsheet.com/2.0/sheets/"
payload = {"Authorization": token,
           "Accept": "application/vnd.ms-excel,image/*"}

amount = len(backed_up_sheets)
i = 1
for el in backed_up_sheets:
    r = requests.get(API_URL + str(backed_up_sheets[el]), headers=payload)
    if r.status_code != 200:
        print('Some problem with connections please retry later0')
        pass
    if not os.path.exists(dir):
        os.makedirs(dir)
    with open(dir + el + time.strftime("-%m_%d_%Y_%H_%M") + ".xlsx", 'wb') as output:
        output.write(r.content)
    print('Progress in sheets: ' + str(i) + '/' + str(amount))
    i += 1
Here's a complete code sample:
# Download an image in a cell
def download_cell_image(client, sheet_id, row_id, column_id, default_filename):
    # Get desired row
    row = client.Sheets.get_row(sheet_id, row_id)
    cell = row.get_column(column_id)
    image = cell.image
    filename = getattr(image, 'alt_text', default_filename)
    # Obtain a temporary image URL
    imageUrl = client.models.ImageUrl({"imageId": image.id})
    response = client.Images.get_image_urls([imageUrl])
    url = response.image_urls[0].url
    # Download the image
    import requests
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
Note that this requires SDK version 1.3.0 or later
The same steps illustrated in the cURL example should work in Python (apologies that we don't have a complete published sample):
1. Get the image id from the cell object, as returned from get_sheet.
2. Convert the image id to a download URL, using images.get_image_urls (docs).
3. Download the image from the URL, probably using the Requests library.
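As a hedged usage sketch for the function above, assuming the smartsheet-python-sdk is installed; the access token is a placeholder and the sheet id is the "Attach" id from the question:

import smartsheet

client = smartsheet.Smartsheet('<access-token>')   # placeholder token
sheet = client.Sheets.get_sheet(86960044478894)    # sheet id from the question

# Walk every cell and download any cell image with download_cell_image()
for row in sheet.rows:
    for cell in row.cells:
        if getattr(cell, 'image', None):
            download_cell_image(client, sheet.id, row.id, cell.column_id,
                                f'{row.id}_{cell.column_id}.png')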
