Extraction of Emails & Header Information from Outlook in Order to Detect Phishing Attacks

A practical demonstration of phishing detection
from imaplib import IMAP4_SSL
import email as em
from email.utils import parsedate, parsedate_tz
from email.parser import HeaderParser
import numpy as np
import pandas as pd
import getpass
from datetime import timedelta, datetime, date
#Functions
class OutlookAccount(object):
    """Small helper around an IMAP-over-SSL session with Outlook / Office 365.

    Typical use is ``login()`` followed by ``load_parse_query()``, which
    searches a mailbox and returns the matching messages' headers as parsed
    ``email.message.Message`` objects.
    """

    def __init__(self, username=None, password=None, folder=None):
        """Remember the credentials and an optional default mailbox.

        No connection is opened here; call ``login()`` for that.
        """
        self.username = username
        self.password = password
        self.folder = folder

    def login(self):
        """Connect to ``outlook.office365.com`` and authenticate.

        The connection is stored on ``self.conn``. Returns whatever
        ``IMAP4_SSL.login`` returns.
        """
        self.conn = IMAP4_SSL('outlook.office365.com')
        response = self.conn.login(self.username, self.password)
        return response

    def search(self, query, folder=None, readonly=False):
        """Select a mailbox and return the data part of the SEARCH response.

        The instance's default ``folder`` wins when it is set; otherwise the
        ``folder`` argument is used. If neither is given, ``'INBOX'`` is
        selected instead of passing ``None`` through to ``select``.
        """
        mailbox = self.folder or folder or 'INBOX'
        self.conn.select(mailbox, readonly)
        resp, data = self.conn.search(None, query)
        return data

    def fetch(self, uids, query):
        """Fetch ``query`` for every message id in a SEARCH result.

        ``uids`` is the list returned by ``search()``: its first element holds
        the space-separated message ids as bytes. Returns the data part of the
        FETCH response, or ``[]`` when the search matched nothing.
        """
        # An empty search result would otherwise raise IndexError (``[]``) or
        # send a FETCH with an empty message set (``[b'']``).
        if not uids or not uids[0]:
            return []
        uid_arr = b','.join(uids[0].split())
        resp, data = self.conn.fetch(uid_arr, query)
        return data

    def fetch_and_parse(self, uids, query):
        """Fetch the messages in ``uids`` and parse each one's headers.

        Returns a list of ``email.message.Message`` objects, one per fetched
        message.
        """
        data = self.fetch(uids, query)
        parser = HeaderParser()
        emails = []
        for item in data:
            # Message payloads arrive as (envelope, raw_bytes) tuples; the
            # remaining items are bare bytes (closing parenthesis, flags, ...)
            # and carry no header text.
            if not isinstance(item, tuple) or len(item) < 2:
                continue
            msg = em.message_from_bytes(item[1]).as_string()
            emails.append(parser.parsestr(msg))
        return emails

    def load_parse_query(self, search_query, fetch_query, folder=None, readonly=False):
        """Run ``search_query`` on a mailbox and return the parsed results.

        Convenience wrapper: ``search()`` followed by ``fetch_and_parse()``.
        """
        uids = self.search(search_query, folder, readonly)
        return self.fetch_and_parse(uids, fetch_query)
# User email credentials
# The account name below is a placeholder; replace it with the mailbox to inspect.
imap_username = 'yourOutlookMail@outlook.com'
# Ask for the password at run time instead of keeping it in the source file.
imap_password = getpass.getpass('IMAP password for {}: '.format(imap_username))

outlook = OutlookAccount(username=imap_username, password=imap_password)
outlook.login()

# Search window, expressed as "days before today".
daysback = 6000
notsince = 0
since = (date.today() - timedelta(daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(notsince)).strftime("%d-%b-%Y")
# NOTE(review): IMAP SENTBEFORE is exclusive, so notsince = 0 appears to leave
# out messages dated today - confirm that this is intended.
SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
# Only the header section of each message is requested.
ALL_HEADERS = '(BODY.PEEK[HEADER])'

# Search and fetch emails!
received = outlook.load_parse_query(search_query=SEARCH, fetch_query=ALL_HEADERS, folder='"INBOX"')
#create function to convert to dataframe
def scrub_email(headers):
    """Turn a sequence of ``(name, value)`` header pairs into a dict.

    Header names are lower-cased so they can be used as uniform column names.
    When a header occurs more than once, the last occurrence wins.
    """
    return {title.lower(): value for title, value in headers}
# One row per fetched message, one column per lower-cased header name.
header_rows = [scrub_email(message._headers) for message in received]
df = pd.DataFrame(header_rows)

Approach-1

For the first attempt, I used the X-Sender-Id header to identify the actual sender behind a spoofed email that claims to come from the user shown in the header's From field. As the header excerpt below shows, a spoofed email not only has an X-Sender-Id that differs from From; it also fails SPF verification.

X-Sender-Id: tih5qno0ow|x-authuser|bilal@higee.net
From: "admin" <admin@company.com> #dummy value
received-SPF: Fail (protection.outlook.com: domain of company.com does not designate 23.83.xxx.x as permitted sender)
import re

# Pattern for a bare email address. It is compiled case-insensitively because
# the character classes only list lower-case letters, so an address such as
# "Admin@Company.com" would otherwise be missed and reported as a mismatch.
mail_re = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
mail_pattern = re.compile(mail_re, re.IGNORECASE)

# Row positions in df of the messages flagged by this approach.
traversed_index = []
# A header that no fetched message carries has no column in df; fall back to
# an empty series so the loop below simply has nothing to compare.
fake_mail = df.get('x-sender-id', pd.Series(dtype=object)).values
sender = df.get('from', pd.Series(dtype=object)).values
for i, (m, n) in enumerate(zip(fake_mail, sender)):
    # Lower-case both sides so that the comparison ignores letter case.
    fk_email = [address.lower() for address in mail_pattern.findall(str(m))]
    sdr_email = [address.lower() for address in mail_pattern.findall(str(n))]
    # Nothing to report when there is no X-Sender-Id address, or when it
    # agrees with the From address.
    if not fk_email or fk_email == sdr_email:
        continue
    traversed_index.append(i)
    domain = fk_email[0].partition('@')[2]
    print("+ \t It could be Phishing")
    x_mail_df = df.iloc[i]
    # X-Mailer may be absent (no such column) or empty for this row (NaN).
    a = x_mail_df.get('x-mailer')
    if isinstance(a, str) and a:
        print("Sender: " + a)
    else:
        print("Sender: " + domain)
    print("Sent From: https://" + str(domain))
    print("Original Sender: ", fk_email)
    print("Pretender Sender: ", sdr_email, "\n\n")
+ 	 It could be Phishing
Sender: Gophish
Sent From: https://higee.net
Original Sender: ['bilal@higee.net']
Pretender Sender: ['admin@company.com'] #dummy value

Approach-2

Although the first approach detected the phishing emails successfully, in some experiments the mail servers changed those header values, so the script did not work reliably. I therefore looked for other header values that could help, and found that the Message-ID also contains information about the real sender. The code for this approach is shown below.

import re

# Pattern for a bare email address. It is compiled case-insensitively because
# the character classes only list lower-case letters, so values containing
# upper-case characters would otherwise be missed or only partly matched.
mail_re = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
mail_pattern = re.compile(mail_re, re.IGNORECASE)

# Row positions in df of the messages flagged by this approach.
traversed_index1 = []
# A header that no fetched message carries has no column in df; fall back to
# an empty series so the loop below simply has nothing to compare.
ph_vals = df.get('message-id', pd.Series(dtype=object)).values
sender = df.get('from', pd.Series(dtype=object)).values
# NOTE(review): this flags every message whose Message-ID does not literally
# equal its From address - confirm that this strictness is intended.
for i, (m, n) in enumerate(zip(ph_vals, sender)):
    # Lower-case both sides so that the comparison ignores letter case.
    ph_email = [address.lower() for address in mail_pattern.findall(str(m))]
    sdr_email = [address.lower() for address in mail_pattern.findall(str(n))]

    # Nothing to report when the Message-ID holds no address-like value, or
    # when it agrees with the From address.
    if not ph_email or ph_email == sdr_email:
        continue
    traversed_index1.append(i)
    domain = ph_email[0].partition('@')[2]
    print("Sent From: https://" + str(domain))
    print("Original Sender: ", ph_email)
    print("Pretender Sender: ", sdr_email, "\n\n")
Sent From: https://emkei.cz
Original Sender:235@emkei.cz
Pretender Sender:admin@company.com #dummy value

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Azhar Ghafoor

Azhar Ghafoor

Cybersecurity Researcher | Ethical Hacking | Data Analyst