Extraction of Emails & Header Information from Outlook in Order to Detect Phishing Attacks

A practical demonstration of phishing detection
from imaplib import IMAP4_SSL
import email as em
from email.utils import parsedate, parsedate_tz
from email.parser import HeaderParser
import numpy as np
import pandas as pd
import getpass
from datetime import timedelta, datetime, date
class OutlookAccount(object):
    """Small IMAP helper for pulling message headers out of an Outlook mailbox.

    Typical use: construct with credentials, call ``login()``, then call
    ``load_parse_query()`` to search a folder and get the matching messages
    back as parsed header objects.
    """

    def __init__(self, username=None, password=None, folder=None):
        """Store the credentials and an optional default folder.

        No network activity happens here; the connection is opened by
        ``login()``.
        """
        self.username = username
        self.password = password
        self.folder = folder

    def login(self):
        """Connect to outlook.office365.com over SSL and authenticate.

        Stores the connection on ``self.conn`` and returns the response of
        the IMAP LOGIN command.
        """
        self.conn = IMAP4_SSL('outlook.office365.com')
        response = self.conn.login(self.username, self.password)
        return response

    def search(self, query, folder=None, readonly=False):
        """Select a mailbox and run an IMAP SEARCH with ``query``.

        The folder given at construction time takes precedence over the
        ``folder`` argument; if neither is set, ``INBOX`` is selected so that
        SELECT is never issued without a mailbox name.

        Returns the data part of the SEARCH response.
        """
        ff = self.folder or folder or 'INBOX'
        self.conn.select(ff, readonly)
        resp, data = self.conn.search(None, query)
        return data

    def fetch(self, uids, query):
        """Fetch ``query`` (e.g. a BODY[...] item) for the ids in ``uids``.

        ``uids`` is the data returned by ``search()``: a list whose first
        element is a space-separated bytes string of message ids.  An empty
        search result returns ``[]`` without contacting the server, because
        FETCH with an empty message set is rejected.
        """
        if not uids or not uids[0]:
            return []
        uid_arr = b','.join(uids[0].split())
        resp, data = self.conn.fetch(uid_arr, query)
        return data

    def fetch_and_parse(self, uids, query):
        """Fetch the messages in ``uids`` and return their parsed headers.

        Returns a list of ``email.message.Message`` objects, one per fetched
        message.
        """
        data = self.fetch(uids, query)
        parser = HeaderParser()
        emails = []
        for part in data:
            # A fetched literal arrives as an (envelope, payload) tuple; the
            # entries in between are bare bytes such as b')' and carry no
            # message content, so only tuples are parsed.
            if not isinstance(part, tuple) or len(part) < 2:
                continue
            msg = em.message_from_bytes(part[1]).as_string()
            emails.append(parser.parsestr(msg))
        return emails

    def load_parse_query(self, search_query, fetch_query, folder=None, readonly=False):
        """Search ``folder`` with ``search_query`` and parse what matches.

        Convenience wrapper: ``search()`` followed by ``fetch_and_parse()``.
        """
        uids = self.search(search_query, folder, readonly)
        return self.fetch_and_parse(uids, fetch_query)
# User email credentials. These are placeholders: replace them before
# running, and avoid committing a real password to source control.
imap_password = 'YourEmailPassword'
imap_username = 'yourOutlookMail@outlook.com'

outlook = OutlookAccount(username=imap_username, password=imap_password)
# Authenticate first: search/fetch run on the connection that login() opens.
outlook.login()

# Date window for the search: from `daysback` days ago up to `notsince`
# days ago, formatted the way IMAP expects (e.g. 01-Jan-2020).
daysback = 6000
notsince = 0
since = (date.today() - timedelta(daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(notsince)).strftime("%d-%b-%Y")
SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
# Fetch item requesting only the header section of each message; the PEEK
# form does not set the \Seen flag on the messages it reads.
ALL_HEADERS = '(BODY.PEEK[HEADER])'
# Search and fetch emails!
received = outlook.load_parse_query(search_query=SEARCH, fetch_query=ALL_HEADERS, folder='"INBOX"')
# Helper used to turn parsed headers into DataFrame rows
def scrub_email(headers):
    """Return the headers as a dict keyed by lower-cased header name.

    ``headers`` is an iterable of ``(name, value)`` pairs.  When a header
    name occurs more than once, the last value wins.
    """
    return {title.lower(): value for title, value in headers}
# One row per fetched message, one column per (lower-cased) header name.
df = pd.DataFrame(
    [scrub_email(message._headers) for message in received]
)


As a first attempt, I used the X-Sender-Id header to track down the individual who was actually behind the fake email and was pretending to be the user listed in the From header. As the header snippet below shows, SPF verification also fails when an email is spoofed, in addition to the obvious mismatch between X-Sender-Id and From.

X-Sender-Id: tih5qno0ow|x-authuser|bilal@higee.net
From: "admin" <admin@company.com> #dummy value
received-SPF: Fail (protection.outlook.com: domain of company.com does not designate 23.83.xxx.x as permitted sender)
import re

# Flag messages whose X-Sender-Id address differs from the From address.
mail_re = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
traversed_index = []
fake_mail = df['x-sender-id'].values
sender = df['from'].values
for i, (m, n) in enumerate(zip(fake_mail, sender)):
    # The pattern only matches lower-case characters, so lower-case the
    # header text first; otherwise a mixed-case address would not match
    # and would be reported as a mismatch.
    fk_email = re.findall(mail_re, str(m).lower())
    sdr_email = re.findall(mail_re, str(n).lower())

    # No address in X-Sender-Id (header missing or in another format):
    # there is nothing to compare, so skip instead of indexing an empty list.
    if not fk_email:
        continue

    if fk_email != sdr_email:
        domain = fk_email[0].partition('@')[2]
        print("+ \t It could be Phishing")
        x_mail_df = df.iloc[i]
        # x-mailer may be absent for this message; str() keeps the
        # concatenation below from failing on a non-string value.
        a = x_mail_df.get('x-mailer')
        print("Sender: " + str(a))
        print("Sender: " + domain)
        print("Sent From: https://" + str(domain))
        print("Original Sender: ", fk_email)
        print("Pretender Sender: ", sdr_email, "\n\n")
+ 	 It could be Phishing
Sender: Gophish
Sent From: https://higee.net
Original Sender: ['bilal@higee.net']
Pretender Sender: ['admin@company.com'] #dummy value


Although I was able to detect the phishing emails successfully, in some experiments the servers changed those header values, so my script did not work reliably. I therefore had to look again for other header values that could help sort out the issue. I then came across the Message-Id header, which also contained information about the fake sender. The code for this approach is shown in the code box below.

import re

# Flag messages whose Message-Id address differs from the From address.
mail_re = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
traversed_index1 = []
ph_vals = df['message-id'].values
sender = df['from'].values
for i, (m, n) in enumerate(zip(ph_vals, sender)):
    # The pattern only matches lower-case characters, so lower-case the
    # header text first; otherwise a mixed-case address would not match
    # and would be reported as a mismatch.
    ph_email = re.findall(mail_re, str(m).lower())
    sdr_email = re.findall(mail_re, str(n).lower())

    # No address-like value in Message-Id (header missing or in another
    # format): nothing to compare, so skip instead of indexing an empty list.
    if not ph_email:
        continue

    if ph_email != sdr_email:
        domain = ph_email[0].partition('@')[2]
        print("Sent From: https://" + str(domain))
        print("Original Sender: ", ph_email)
        print("Pretender Sender: ", sdr_email, "\n\n")
Sent From: https://emkei.cz
Original Sender:235@emkei.cz
Pretender Sender:admin@company.com #dummy value



