How do I retrieve a hyperlink URL from an email within Python?

I have the following code to fetch an HTML email as an email.Message object, but I am unsure how to proceed from here to find the URL of a hyperlink given its display name. I have located the URL in get_payload(0). In addition, the email body is quoted-printable encoded (the link appears as href=3D...), so it doesn't work if I just copy the link and paste it into a browser.
import sys
import imaplib
import getpass
import email
import datetime

M = imaplib.IMAP4_SSL('imap.gmail.com')
M.login('email@email.com', 'password123')
rv, boxes = M.list()
rv, boxes = M.select('Inbox/Test1')
rv, data = M.search(None, 'ALL')
typ, msg_data = M.fetch('1', '(RFC822)')
msg = email.message_from_string(msg_data[0][1])
url_name = 'Click Here'
html_text = msg.get_payload(0)

This will print every href in the message. You can update the parseLinks class to match whichever display string you are interested in.
import imaplib
import email
import quopri
import HTMLParser

class parseLinks(HTMLParser.HTMLParser):
    """Print the href of every <a> tag fed to the parser."""
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    print name
                    print value

M = imaplib.IMAP4_SSL('imap.gmail.com')
M.login('email@email.com', 'password123')
M.select('Inbox/Test1')
rv, data = M.search(None, 'ALL')
typ, msg_data = M.fetch('1', '(RFC822)')
msg = email.message_from_string(msg_data[0][1])
msg = str(msg.get_payload()[0])
# Decode the quoted-printable body (this is why href=3D appears in the raw text)
msg = quopri.decodestring(msg)
linkParser = parseLinks()
linkParser.feed(msg)
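The code above is Python 2. For Python 3, where HTMLParser moved to html.parser and fetch returns bytes, a minimal sketch of the same idea might look like this. It finds the href whose link text matches a given display name; the credentials and mailbox name are placeholders, and it assumes (as in the question) a multipart message whose first part is the HTML body:

import email
import imaplib
from html.parser import HTMLParser

class LinkFinder(HTMLParser):
    """Collect the href of every <a> whose link text matches target."""
    def __init__(self, target):
        super().__init__()
        self.target = target
        self._href = None
        self.matches = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._href = dict(attrs).get('href')
    def handle_data(self, data):
        if self._href and data.strip() == self.target:
            self.matches.append(self._href)
    def handle_endtag(self, tag):
        if tag == 'a':
            self._href = None

M = imaplib.IMAP4_SSL('imap.gmail.com')
M.login('email@email.com', 'password123')  # placeholder credentials
M.select('Inbox/Test1')
typ, msg_data = M.fetch('1', '(RFC822)')
msg = email.message_from_bytes(msg_data[0][1])
# get_payload(decode=True) undoes the quoted-printable encoding for us,
# so no manual quopri call is needed
html = msg.get_payload(0).get_payload(decode=True).decode('utf-8', 'replace')
finder = LinkFinder('Click Here')
finder.feed(html)
print(finder.matches)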

Related

Scraping data from website using Beautiful Soup and Pandas

I have a Python script that uses the BeautifulSoup and Pandas packages to scrape data from a list of URLs, convert the data into a dataframe, save it as an Excel file, and send it by email as an attachment.
The problem is that when the script runs and finishes scraping the first item, it crashes and returns the error below:
ValueError: 15 columns passed, passed data had 14 columns
I think this means that there is a missing HTML tag, right?
The list includes 3 URLs.
code:
import time
from datetime import date
import smtplib
import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website():
    url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]
    for url in url_list:
        soup = BeautifulSoup(requests.get(url).content, "lxml")
        links = []
        for a in soup.select("h2.m0.t-regular a"):
            if a['href'] not in links:
                links.append("https://www.bayt.com" + a['href'])
        joineddd = []
        for link in links:
            s = BeautifulSoup(requests.get(link).content, "lxml")
            alldd = [dd.text for dd in s.select(
                "div[class='card-content is-spaced'] dd")]
            alldd.insert(0, link)
            joineddd.append(alldd)
        print("Web Crawling is Done for {}".format(url))
        convert_to_dataFrame(joineddd)
    send_email()

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def convert_to_dataFrame(joineddd):
    df = pd.DataFrame(joineddd, columns=[
        "link", "location", "Company_Industry", "Company_Type",
        "Job_Role", "Employment_Type", "Monthly_Salary_Range",
        "Number_of_Vacancies", "Career_Level",
        "Years_of_Experience", "Residence_Location",
        "Gender", "Nationality", "Degree", "Age"])
    df = remove_unwanted_cols(df, ["Company_Industry", "Company_Type", "Job_Role", "Number_of_Vacancies"])
    df_to_excel = df.to_excel(r"F:\\AIenv\web_scrapping\\jobDesc.xlsx", index=False, header=True)
    send_email()

def send_email():
    '''send email'''
    today = date.today()
    file = 'F:\\AIenv\web_scrapping\\jobDesc.xlsx'
    username = 'XXXXXXXXXXX'
    password = 'XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {}'.format(today)
    server = smtplib.SMTP('smtp.gmail.com')
    port = '587'
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com')
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')

if __name__ == "__main__":
    scrape_website()
Update the function scrape_website() to save alldd as a dictionary:
for link in links:
    s = BeautifulSoup(requests.get(link).content, "lxml")
    ### update Start ###
    alldd = dict()
    alldd['link'] = link
    dd_div = [i for i in s.select("div[class='card-content is-spaced'] div")
              if ('<dd>' in str(i)) and ('<dt>' in str(i))]
    for div in dd_div:
        k = div.select_one('dt').get_text(';', True)
        v = div.select_one('dd').get_text(';', True)
        alldd[k] = v
    ### update End ###
    joineddd.append(alldd)

# result
df = pd.DataFrame(joineddd)
alldd sample:

{
    'link': 'https://www.bayt.com/en/qatar/jobs/executive-chef-4298309/',
    'Job Location': 'Doha, Qatar',
    'Company Industry': 'Real Estate; Hospitality & Accomodation; Catering, Food Service, & Restaurant',
    'Company Type': 'Employer (Private Sector)',
    'Job Role': 'Hospitality and Tourism',
    'Employment Type': 'Unspecified',
    'Monthly Salary Range': 'Unspecified',
    'Number of Vacancies': 'Unspecified',
    'Career Level': 'Mid Career',
    'Years of Experience': 'Min: 7',
    'Residence Location': 'Qatar',
    'Degree': "Bachelor's degree / higher diploma"
}
ValueError: 15 columns passed, passed data had 14 columns
This error means you declared the dataframe to have 15 columns, but the row of data you fed it only had 14 values. You need to check the scraped page to make sure it actually has the data you expect, or adjust your expected columns and their names to match what comes back.
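A minimal illustration of the same failure, independent of the site: building a DataFrame from a row that is one value short of the declared columns raises exactly this ValueError.

import pandas as pd

# Three column names declared, but the row only has two values:
try:
    pd.DataFrame([[1, 2]], columns=["a", "b", "c"])
except ValueError as e:
    print(e)  # "3 columns passed, passed data had 2 columns"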
Let's clean up some of this code.
You don't need to write a function to remove columns, there's already a method to do that with .drop(). So delete the function remove_unwanted_cols(dataset, cols) and simply change the line:
df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])
to
df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)
Do you mean to have it send email twice? You have it do that in both the scrape_website() function and the convert_to_dataFrame() function.
If pulling data to construct a dataframe, I usually try to avoid lists, for exactly the reason you hit this error: one page has x columns, but the next scrape has an extra one (or the lengths otherwise don't match). Dictionaries handle that better, with the key being the column name and the value the data. So you'll have a list of dictionaries: each item in the list is a row, and each key/value pair in the dictionary fills in a column (see the sketch below). Then you could get rid of the convert_to_dataFrame() function, since pandas can build the frame for you, but we'll leave it in and you can keep it or remove it as you like.
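Here is a small sketch of why the list-of-dicts approach is more forgiving (the rows are hypothetical): pandas fills any missing key with NaN instead of raising.

import pandas as pd

rows = [
    {"link": "https://example.com/a", "Career_Level": "Mid Career", "Gender": "Unspecified"},
    {"link": "https://example.com/b", "Career_Level": "Senior"},  # 'Gender' missing here
]
df = pd.DataFrame(rows)
print(df)  # the missing 'Gender' cell simply becomes NaN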
If you are using r'' for your strings, you don't need to escape the backslashes. Either do r"F:\AIenv\web_scrapping\jobDesc.xlsx" or "F:\\AIenv\\web_scrapping\\jobDesc.xlsx".
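For example, the two spellings produce the same path:

# A raw string leaves backslashes alone; a normal string needs them doubled.
print(r"F:\AIenv\web_scrapping\jobDesc.xlsx" == "F:\\AIenv\\web_scrapping\\jobDesc.xlsx")  # True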
Code:
import time
from datetime import date
import smtplib
import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website(url):
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    subject = url.split('/')
    subject = [x for x in subject if x != ''][-1]
    links = []
    for a in soup.select("h2.m0.t-regular a"):
        if a['href'] not in links:
            links.append("https://www.bayt.com" + a['href'])
    joineddd = []
    for link in links:
        row = {}
        s = BeautifulSoup(requests.get(link).content, "lxml")
        job_description = s.find('h2', text='Job Description').find_next('dl')
        data_titles = job_description.find_all('dt')
        for data_title in data_titles:
            dt = '_'.join(data_title.text.split())
            dd = data_title.find_next('dd').text.strip()
            row.update({dt: dd})
        if s.find('h2', text='Preferred Candidate'):
            preferred_candidate = s.find('h2', text='Preferred Candidate').find_next('dl')
            data_titles = preferred_candidate.find_all('dt')
            for data_title in data_titles:
                dt = '_'.join(data_title.text.split())
                dd = data_title.find_next('dd').text.strip()
                row.update({dt: dd})
        joineddd.append(row)
    print("Web Crawling is Done for {}".format(url))
    convert_to_dataFrame(joineddd, subject)
    #send_email(subject)  #<-- did you want to send here?

def convert_to_dataFrame(joineddd, subject):
    df = pd.DataFrame(joineddd)
    df = df.drop(["Company_Industry", "Company_Type", "Job_Role", "Number_of_Vacancies"], axis=1)
    df_to_excel = df.to_excel(r"F:\AIenv\web_scrapping\jobDesc.xlsx", index=False, header=True)
    send_email(subject)  #<-- or do you want to send here?

def send_email(subject):
    '''send email'''
    today = date.today()
    file = r'F:\AIenv\web_scrapping\jobDesc.xlsx'
    username = 'XXXXXXXXXXX'
    password = 'XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {} - {}'.format(today, subject)
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com')
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')

url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]

if __name__ == "__main__":
    for url in url_list:
        scrape_website(url)

How to Fix AttributeError: module 'requests.sessions' has no attribute 'post'

I want to create an account checker with Python. This is my code:
import requests
from bs4 import BeautifulSoup

class output(object):
    def tested(self, email, password):
        print(f" Posted: {email} {password}")

# Get CSRF TOKEN
class checker_start(object):
    def get_token(self):
        data = requests.get("https://www.marlboro.id/auth/login")
        soup = BeautifulSoup(data.text, "lxml")
        decide_csrf = soup.find("input", {"name": "decide_csrf"})["value"]
        return decide_csrf

    def post(self, combo):
        decide_csrf = self.get_token()
        email = combo[0]
        password = combo[1]
        api = requests.sessions
        # Creating POST METHOD #
        param = {"_method": "POST", "decide_csrf": decide_csrf, "email": email, "password": password}
        source = api.post("https://www.marlboro.id/auth/login", data=param).text
        if """<div class="err__msg-big">Opps!</div>""" in source:
            output().tested(email, password)
        else:
            output().tested(email, password)

    def start(self):
        file = input("Please input your txt file: ")
        data = open(file, "r").readlines()
        data_s = [lines.replace("\n", " ") for lines in data]
        for lines in data_s:
            combo = lines.split("|")
            self.post(combo)

if __name__ == "__main__":
    checker_start().start()
But when I run this code the output is:

Traceback (most recent call last):
  File "/Users/erzajullian/PycharmProjects/Checker/marlboro.py", line 39, in <module>
    checker_start().start()
  File "/Users/erzajullian/PycharmProjects/Checker/marlboro.py", line 36, in start
    self.post(combo)
  File "/Users/erzajullian/PycharmProjects/Checker/marlboro.py", line 23, in post
    source = api.post("https://www.marlboro.id/auth/login", data=param).text
AttributeError: module 'requests.sessions' has no attribute 'post'

How can I solve this problem? Thank you.
You have
api = requests.sessions
api is not the requests module after this point, so it has no post() method.
If you replace
source = api.post("https://www.marlboro.id/auth/login", data=param).text
with
source = requests.post("https://www.marlboro.id/auth/login", data=param).text
Does that fix things? If so, you can remove the api = requests.sessions line.
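A quick sketch of the two working options (using httpbin.org as a safe stand-in URL and placeholder form data): the module-level requests.post, or a requests.Session() if you want the cookies from the token GET to carry over to the login POST.

import requests

param = {"email": "user@example.com", "password": "hunter2"}  # placeholder data

# Option 1: module-level post
source = requests.post("https://httpbin.org/post", data=param).text

# Option 2: a Session object, which keeps cookies across requests
with requests.Session() as s:
    source = s.post("https://httpbin.org/post", data=param).text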
First of all, you'll have to change your code a bit: POST requests should be handled differently.
import requests

with requests.Session() as s:
    # Define the headers you will send with the POST request
    headers = {'user-agent': 'YOUR_HEADERS'}
    url = 'some_url'
    login_data = {
        'username': 'user',
        'password': 'password',
        # Any other login data your site requires
    }
    r = s.post(url, data=login_data, headers=headers)
    # now you can use r.content to parse the returned html in bs4

Hope this helps.

How to get the URLs of the most recent posts of an Instagram user? (with Python)

I want to get the URLs of the most recent posts of an Instagram user (not me, and I don't have an IG account so I can't use the API). The URLs should be in the style of https://www.instagram.com/p/BpnlsmWgqon/
I've tried making a request with response = requests.get(profile_url) and then parsing the HTML with soup = BeautifulSoup(html, 'html.parser').
After these and some other functions I get a big JSON blob with data on the most recent pics (but not their URLs).
How can I get the URLs and extract just those?
Edit: This is what I've coded so far. It's a mess; I've tried many approaches but none has worked.
#from subprocess import call
#from instagram.client import InstagramAPI
import requests
import json
from bs4 import BeautifulSoup
#from InstagramAPI.InstagramAPI import InstagramAPI
from instagram.client import InstagramAPI
from config import login, password

userid = "6194091573"
#url = "https://www.instagram.com/mercadona.novedades/?__a=1"
#pic_url =
#call('instalooter user mercadona.novedades ./pics -n 2')
#r = requests.get("https://www.instagram.com/mercadona.novedades")
#print(r.text)

def request_pic_url(profile_url):
    response = requests.get(profile_url)
    return response.text

def extract_json(html):
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('body')
    script_tag = body.find('script')
    raw_string = script_tag.text.strip().replace('window._sharedData =', '').replace(';', '')
    return json.loads(raw_string)

def get_recent_pics(profile_url):
    results = []
    response = request_pic_url(profile_url)
    json_data = extract_json(response)
    metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
    for node in metrics:
        node = node.get('node')
        if node and isinstance(node, dict):
            results.append(node)
    return results

def api_thing():
    api = InstagramAPI(login, password)
    recent_media, next_ = api.user_recent_media(userid, 2)
    for media in recent_media:
        print(media.caption.text)

def main():
    userid = "6194091573"
    api_thing()

if __name__ == "__main__":
    main()

def get_large_pic(url):
    return url + "/media/?size=l"

def get_media_id(url):
    req = requests.get('https://api.instagram.com/oembed/?url={}'.format(url))
    media_id = req.json()['media_id']
    return media_id
I suggest you use the following library: https://github.com/LevPasha/Instagram-API-python
api = InstagramAPI("username", "password")
api.login()

def get_lastposts(us_id):
    api.getUserFeed(us_id)
    if 'items' in api.LastJson:
        info = api.LastJson['items']
        posts = []
        for media in info:
            if media['caption'] != None:
                #print(media['caption']['media_id'])
                posts.append(media['caption']['media_id'])
        return posts

get_lastposts('user_id')

Python IMAPLIB HTML body parsing

So basically I have this script that runs continuously, and when a new email arrives in the inbox with specific text in the subject, it grabs information from the email. I have only managed to get it to pull the subject from the email; I can't get the body of the email no matter what I try. I believe the email body is in HTML, so I attempted to use BeautifulSoup to parse the body, but that doesn't work at all. Please help! Here is what I have so far:
import email
import imaplib
from bs4 import BeautifulSoup
import time
import sys

username = 'xxx.xxx@xxx.xx'
password = 'xxxxxx'
mail = imaplib.IMAP4_SSL('imap-mail.outlook.com')
(retcode, capabilities) = mail.login(username, password)
mail.list()
n = 0
while True:
    mail.select('inbox')
    (retcode, messages) = mail.search(None, 'UNSEEN', '(SUBJECT "xxxxxxx-")', '(FROM "xx.xx@xxxx.xx")')
    if retcode == 'OK':
        for num in messages[0].split():
            n = n + 1
            print('Processing Email ' + str(n))
            typ, data = mail.fetch(num, '(RFC822)')
            for response_part in data:
                if isinstance(response_part, tuple):
                    original = email.message_from_bytes(response_part[1])
                    print("Subject: " + original['Subject'])
                    typ, data = mail.store(num, '+FLAGS', '\\Seen')
    time.sleep(120)
Comment: The "body" returned by imap.fetch is usually bytes, not a string, which throws an exception.
Change to:
msg = email.message_from_bytes(body)
Question: I can't get it to get the body of the email
For example:
import email, imaplib

username = 'xxx.xxx@xxx.xx'
password = 'xxxxxx'
imap = imaplib.IMAP4_SSL('imap-mail.outlook.com')
imap.login(username, password)
imap.select("inbox")
resp, items = imap.search(None, "(UNSEEN)")
for n, num in enumerate(items[0].split(), 1):
    resp, data = imap.fetch(num, '(RFC822)')
    body = data[0][1]
    # body is bytes, so build the Message with message_from_bytes
    msg = email.message_from_bytes(body)
    content = msg.get_payload(decode=True)
    print("Message content[{}]:{}".format(n, content))

Data is sent to the form by a Python script and gets a 200 response, but no email arrives (the result of the view's work)

I can't submit data to the form on the website through a Python script. It's a regular feedback form that works properly when filled in manually, but nothing is sent when the request comes from the script. What should I do?
The script sending a request to the site:
import requests
import sys
URL = 'http://127.0.0.1:8000/'
client = requests.session()
client.get(URL)
csrftoken = client.cookies['csrftoken']
login_data = dict(lastname='Игин', name='Anton', middlename='Konst', birthday='2017-04-20', telephone='(896) 097-29-02', csrfmiddlewaretoken=csrftoken, next='form_call/')
r = client.post(URL, data=login_data, headers=dict(Referer=URL))
views.py

def form_call(request):
    if request.method == 'POST':
        form = Call_Form(request.POST)
        name = request.POST.get('name', '')
        lastname = request.POST.get('lastname', '')
        middlename = request.POST.get('middlename', '')
        birthday = request.POST.get('birthday', '')
        telephone = request.POST.get('telephone', '')
        if form.is_valid():
            mail_host = SMTPMail.objects.all()[0]
            rec_list = RecMail.objects.all()
            recipients = []
            for mail in rec_list:
                recipients.append(mail.mail)  # list of recipients
            message = '''
            A new call request has arrived on your NL International site! Here is the data provided by the new consultant:
            Full name: {0} {1} {2}
            Date of birth: {3}
            Telephone: {4}'''.format(name, lastname, middlename, birthday, telephone)
            subject = 'Call request'
            send_mail(subject, message, mail_host.mail, recipients, fail_silently=False)
            return redirect('/thanks/')
        else:
            return redirect('/error/')
I have already solved my problem. Here is my code:
import requests
import sys
URL = 'http://127.0.0.1:8000/' #address of web page with html form
URL2 = 'http://127.0.0.1:8000/form_call/' #address of view processing form data
client = requests.session()
client.get(URL)
csrftoken = client.cookies['csrftoken']
login_data = dict(lastname='Игин', name='Anton', middlename='Konst testim', birthday='2017-01-20', telephone='896-002-00-02', csrfmiddlewaretoken=csrftoken)
r = client.post(URL2, data=login_data)
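The fix is posting directly to the URL that processes the form (form_call/) rather than the page that merely displays it. As a general pattern for posting to a Django form with requests, a sketch (Django also checks the Referer header on HTTPS sites, so it is included here; the field names are the ones from this form):

import requests

URL = 'http://127.0.0.1:8000/'             # page that sets the csrftoken cookie
URL2 = 'http://127.0.0.1:8000/form_call/'  # view that processes the POST

with requests.Session() as client:
    client.get(URL)  # obtain the CSRF cookie first
    data = {
        'lastname': 'Igin', 'name': 'Anton', 'middlename': 'Konst',
        'birthday': '2017-01-20', 'telephone': '896-002-00-02',
        'csrfmiddlewaretoken': client.cookies['csrftoken'],
    }
    r = client.post(URL2, data=data, headers={'Referer': URL})
    print(r.status_code)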
