/**
 * MIME Decoding operation
 *
 * Decodes RFC 2047 "encoded-words" (`=?charset?encoding?text?=`) found in
 * MIME message headers. Text outside encoded-words is passed through as-is.
 */
class MIMEDecoding extends Operation {

    /**
     * MIMEDecoding constructor
     */
    constructor() {
        super();

        this.name = "MIME Decoding";
        this.module = "Default";
        this.description = "Enables the decoding of MIME message header extensions for non-ASCII text";
        this.infoURL = "https://tools.ietf.org/html/rfc2047";
        this.inputType = "byteArray";
        this.outputType = "string";
        this.args = [];
    }

    /**
     * Decodes every encoded-word in the input.
     *
     * @param {byteArray} input
     * @param {Object[]} args
     * @returns {string}
     * @throws {OperationError} if an encoded-word is malformed or uses an unhandled charset
     */
    run(input, args) {
        const mimeEncodedText = Utils.byteArrayToUtf8(input);
        // Normalise line endings so folded headers are handled uniformly.
        const encodedHeaders = mimeEncodedText.replace(/\r\n/g, "\n");

        return this.decodeHeaders(encodedHeaders);
    }

    /**
     * Decode MIME header strings.
     *
     * Scans for `=?charset?encoding?text?=` sequences, decodes "B" (Base64)
     * and "Q" encoded text and converts the result from the named charset.
     * Whitespace that only separates two adjacent encoded-words is dropped;
     * any other text between words is kept.
     *
     * @param {string} headerString
     * @returns {string}
     * @throws {OperationError} if a Q encoded word is malformed or the charset is unhandled
     */
    decodeHeaders(headerString) {
        // No encoded words detected
        const firstWord = headerString.indexOf("=?");
        if (firstWord === -1) return headerString;

        let decodedHeaders = headerString.slice(0, firstWord);
        let header = headerString.slice(firstWord);

        // True while the previously consumed token was a decoded encoded-word.
        let isBetweenWords = false;

        while (header.length > 0) {
            const start = header.indexOf("=?");
            if (start === -1) break;

            const charsetStart = start + "=?".length;
            const charsetEnd = header.indexOf("?", charsetStart);
            if (charsetEnd === -1) break;
            const charset = header.slice(charsetStart, charsetEnd);

            // There must be room for at least the encoding letter, "?", and "?=".
            const encodingIndex = charsetEnd + "?".length;
            if (header.length < encodingIndex + "Q??=".length) break;

            const encoding = header[encodingIndex].toLowerCase();
            if (header[encodingIndex + 1] !== "?") break;

            const textStart = encodingIndex + 2;
            const textEnd = header.indexOf("?=", textStart);
            if (textEnd === -1) break;

            const encodedText = header.slice(textStart, textEnd);
            const end = textEnd + "?=".length;

            let text;
            if (encoding === "b") {
                text = fromBase64(encodedText);
            } else if (encoding === "q") {
                text = this.parseQEncodedWord(encodedText);
            } else {
                // Not a valid encoded-word: emit the "=?" literally and resume
                // scanning right after it. The `continue` is essential - the
                // offsets computed above refer to the header before slicing.
                isBetweenWords = false;
                decodedHeaders += header.slice(0, start + "=?".length);
                header = header.slice(start + "=?".length);
                continue;
            }

            // Keep the text preceding this word unless it is pure whitespace
            // separating two encoded-words.
            const gap = header.slice(0, start);
            if (start > 0 && (!isBetweenWords || /\S/.test(gap))) {
                decodedHeaders += gap;
            }

            decodedHeaders += this.convertFromCharset(charset, text);

            header = header.slice(end);
            isBetweenWords = true;
        }

        // Whatever was not consumed (trailing text or a malformed word) is kept verbatim.
        return decodedHeaders + header;
    }

    /**
     * Converts decoded text for supported charsets.
     * Supports UTF-8, US-ASCII, ISO-8859-*
     *
     * An optional RFC 2231 language suffix (`charset*language`) is accepted
     * and ignored.
     *
     * @param {string} charset - charset label taken from the encoded-word (case-insensitive)
     * @param {string} encodedText - transfer-decoded text, passed to cptable.utils.decode
     * @returns {string}
     * @throws {OperationError} if the charset is not handled
     */
    convertFromCharset(charset, encodedText) {
        const charsetName = charset.split("*")[0].toLowerCase();

        let codepage = null;
        if (charsetName === "utf-8") {
            codepage = 65001;
        } else if (charsetName === "us-ascii") {
            codepage = 20127;
        } else {
            const isoMatch = /^iso-8859-(\d{1,2})$/.exec(charsetName);
            if (isoMatch !== null) {
                const isoPart = parseInt(isoMatch[1], 10);
                if (isoPart >= 1 && isoPart <= 16) {
                    codepage = 28590 + isoPart;
                }
            }
        }

        // Not every number in the ISO-8859 range maps to an existing code page,
        // so check before decoding to report a consistent OperationError.
        if (codepage === null || !cptable.utils.hascp(codepage)) {
            throw new OperationError("Unhandled Charset");
        }

        return cptable.utils.decode(codepage, encodedText);
    }

    /**
     * Parses a Q encoded word
     *
     * "_" becomes a space, "=XX" becomes the byte with hex value XX, and
     * printable ASCII plus tab/CR/LF is copied unchanged.
     *
     * @param {string} encodedWord - the text portion of a Q encoded-word
     * @returns {string} one character per decoded byte
     * @throws {OperationError} on a truncated or non-hex "=XX" escape or a disallowed character
     */
    parseQEncodedWord(encodedWord) {
        let decodedWord = "";
        for (let i = 0; i < encodedWord.length; i++) {
            const char = encodedWord[i];
            const code = encodedWord.charCodeAt(i);

            if (char === "_") {
                decodedWord += " ";
            } else if (char === "=") {
                // Parse hex encoding. Validate explicitly so a truncated or
                // non-hex escape is rejected rather than depending on how
                // fromHex treats unexpected input.
                const hexPair = encodedWord.slice(i + 1, i + 3);
                if (!/^[0-9a-f]{2}$/i.test(hexPair)) {
                    throw new OperationError("Incorrectly Encoded Word");
                }
                decodedWord += Utils.byteArrayToChars(fromHex(hexPair));
                i += 2;
            } else if (
                (code >= 0x20 && code <= 0x7e) ||
                char === "\n" ||
                char === "\r" ||
                char === "\t") {
                decodedWord += char;
            } else {
                throw new OperationError("Incorrectly Encoded Word");
            }
        }

        return decodedWord;
    }
}
/**
 * MIME Decoding test cases as [name, input, expectedOutput] tuples.
 * Every case runs the single-operation recipe "MIME Decoding" with no arguments.
 */
TestRegister.addTests([
    ["Encoded comments", "(=?ISO-8859-1?Q?a?=)", "(a)"],
    ["Encoded adjacent comments whitespace", "(=?ISO-8859-1?Q?a?= b)", "(a b)"],
    ["Encoded adjacent single whitespace ignored", "(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)", "(ab)"],
    // NOTE(review): named "double whitespace" but the input as visible here
    // contains a single space - confirm against the original file.
    ["Encoded adjacent double whitespace ignored", "(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)", "(ab)"],
    ["Encoded adjacent CRLF whitespace ignored", "(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)", "(ab)"],
    ["UTF-8 Encodings Multiple Headers", "=?utf-8?q?=C3=89ric?= , =?utf-8?q?Ana=C3=AFs?= ", "Éric , Anaïs "],
    [
        "ISO Decoding",
        "From: =?US-ASCII?Q?Keith_Moore?= \nTo: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= \nCC: =?ISO-8859-1?Q?Andr=E9?= Pirard \nSubject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\n=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=",
        "From: Keith Moore \nTo: Keld Jørn Simonsen \nCC: André Pirard \nSubject: If you can read this you understand the example.",
    ],
].map(([name, input, expectedOutput]) => ({
    name,
    input,
    expectedOutput,
    recipeConfig: [
        {
            "op": "MIME Decoding",
            "args": []
        }
    ]
})));