# 20_html_tests.cf


# SpamAssassin rules file: HTML tests
#
# Please don't modify this file as your changes will be overwritten with
# the next update. Use @@LOCAL_RULES_DIR@@/local.cf instead.
# See 'perldoc Mail::SpamAssassin::Conf' for details.
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
#
###########################################################################

require_version @@VERSION@@

# HTML parser tests
#
# please sort these by eval type then name

# Control rule for the HTML tests: evaluated through html_test('html_message').
# Every HTML spam rule is expected to achieve a better S/O than this one.
body     HTML_MESSAGE  eval:html_test('html_message')
describe HTML_MESSAGE  HTML included in message

# HTML percentage of the message, split into ten consecutive bands.
# Each band is its own html_range('ratio', low, high) rule, kept next to its
# description.  This should really be converted into a numeric function test.
body     HTML_00_10   eval:html_range('ratio','0.00','0.10')
describe HTML_00_10   Message is 0% to 10% HTML
body     HTML_10_20   eval:html_range('ratio','0.10','0.20')
describe HTML_10_20   Message is 10% to 20% HTML
body     HTML_20_30   eval:html_range('ratio','0.20','0.30')
describe HTML_20_30   Message is 20% to 30% HTML
body     HTML_30_40   eval:html_range('ratio','0.30','0.40')
describe HTML_30_40   Message is 30% to 40% HTML
body     HTML_40_50   eval:html_range('ratio','0.40','0.50')
describe HTML_40_50   Message is 40% to 50% HTML
body     HTML_50_60   eval:html_range('ratio','0.50','0.60')
describe HTML_50_60   Message is 50% to 60% HTML
body     HTML_60_70   eval:html_range('ratio','0.60','0.70')
describe HTML_60_70   Message is 60% to 70% HTML
body     HTML_70_80   eval:html_range('ratio','0.70','0.80')
describe HTML_70_80   Message is 70% to 80% HTML
body     HTML_80_90   eval:html_range('ratio','0.80','0.90')
describe HTML_80_90   Message is 80% to 90% HTML
body     HTML_90_100  eval:html_range('ratio','0.90','1.00')
describe HTML_90_100  Message is 90% to 100% HTML

# HTML "shouting" level, one rule per unit-wide band of 'max_shouting'
# (2-3 up to 6-7).  All bands share the same description text.
# This should really be converted into a numeric function test.
body     HTML_SHOUTING3  eval:html_range('max_shouting','2','3')
describe HTML_SHOUTING3  HTML has very strong "shouting" markup
body     HTML_SHOUTING4  eval:html_range('max_shouting','3','4')
describe HTML_SHOUTING4  HTML has very strong "shouting" markup
body     HTML_SHOUTING5  eval:html_range('max_shouting','4','5')
describe HTML_SHOUTING5  HTML has very strong "shouting" markup
body     HTML_SHOUTING6  eval:html_range('max_shouting','5','6')
describe HTML_SHOUTING6  HTML has very strong "shouting" markup
body     HTML_SHOUTING7  eval:html_range('max_shouting','6','7')
describe HTML_SHOUTING7  HTML has very strong "shouting" markup

# Text that appears after the closing HTML tag.
body     HTML_TEXT_AFTER_HTML  eval:html_test('text_after_html')
describe HTML_TEXT_AFTER_HTML  HTML contains text after HTML close tag

# Text that appears after the closing BODY tag.
body     HTML_TEXT_AFTER_BODY  eval:html_test('text_after_body')
describe HTML_TEXT_AFTER_BODY  HTML contains text after BODY close tag

# HTML comment tests (patterns are matched against the 'comment' text)

# "<!" not followed by "-", then at most 6 characters before ">".
body     HTML_COMMENT_SHORT      eval:html_text_match('comment', '<!(?!-).{0,6}>')
describe HTML_COMMENT_SHORT      HTML comment is very short

# "saved from url=(NNNN)" marker comment, NNNN being four digits.
body     HTML_COMMENT_SAVED_URL  eval:html_text_match('comment', '<!-- saved from url=\(\d{4}\)')
describe HTML_COMMENT_SAVED_URL  HTML message is a saved web page

# A comment is a spam sign when it follows <DIV>.
body     HTML_CONVERTED     eval:html_test('div_converted')
describe HTML_CONVERTED     HTML conversion tool used by spam

# Embedded plugin objects.
body     HTML_EMBEDS        eval:html_test('embeds')
describe HTML_EMBEDS        HTML with embedded plugin object

# Auto-executing event code flagged as unsafe.
body     HTML_EVENT_UNSAFE  eval:html_test('html_event_unsafe')
describe HTML_EVENT_UNSAFE  HTML contains unsafe auto-executing code

# --- font size: thresholds on the smallest ('min_size') size seen ---
body     HTML_FONT_SIZE_TINY     eval:html_eval('min_size', '< 1')
describe HTML_FONT_SIZE_TINY     HTML font size is tiny

body     HTML_FONT_SIZE_NONE     eval:html_eval('min_size', '< 0')
describe HTML_FONT_SIZE_NONE     HTML font size is negative

# --- font size: bands on the largest ('max_size') size seen ---
body     HTML_FONT_SIZE_LARGE    eval:html_range('max_size', '5', '6')
describe HTML_FONT_SIZE_LARGE    HTML font size is large

# upper bound 'inf' leaves this band open-ended
body     HTML_FONT_SIZE_HUGE     eval:html_range('max_size', '6', 'inf')
describe HTML_FONT_SIZE_HUGE     HTML font size is huge

# --- font size: flags set for big / tiny font tags ---
body     HTML_FONT_BIG           eval:html_test('big_font')
describe HTML_FONT_BIG           HTML tag for a big font size

body     HTML_FONT_TINY          eval:html_test('tiny_font')
describe HTML_FONT_TINY          HTML tag for a tiny font size

# --- font colour versus background ---
body     HTML_FONT_INVISIBLE     eval:html_test('font_invisible')
describe HTML_FONT_INVISIBLE     HTML font color is same as background

body     HTML_FONT_LOW_CONTRAST  eval:html_test('font_low_contrast')
describe HTML_FONT_LOW_CONTRAST  HTML font color similar to background

# --- font face sanity ---
body     HTML_FONT_FACE_BAD      eval:html_test('font_face_bad')
describe HTML_FONT_FACE_BAD      HTML font face is not a word

body     HTML_FONT_FACE_CAPS     eval:html_test('font_face_caps')
describe HTML_FONT_FACE_CAPS     HTML font face has excess capital characters

# Form whose action sends mail.
body     HTML_FORMACTION_MAILTO  eval:html_test('form_action_mailto')
describe HTML_FORMACTION_MAILTO  HTML includes a form which sends mail

# HTML_IMAGE_ONLY - not much raw HTML with images (absolute).
# Six consecutive 400-byte bands, from 0 up to 2400 bytes of words.
body     HTML_IMAGE_ONLY_04  eval:html_image_only('0000','0400')
describe HTML_IMAGE_ONLY_04  HTML: images with 0-400 bytes of words
body     HTML_IMAGE_ONLY_08  eval:html_image_only('0400','0800')
describe HTML_IMAGE_ONLY_08  HTML: images with 400-800 bytes of words
body     HTML_IMAGE_ONLY_12  eval:html_image_only('0800','1200')
describe HTML_IMAGE_ONLY_12  HTML: images with 800-1200 bytes of words
body     HTML_IMAGE_ONLY_16  eval:html_image_only('1200','1600')
describe HTML_IMAGE_ONLY_16  HTML: images with 1200-1600 bytes of words
body     HTML_IMAGE_ONLY_20  eval:html_image_only('1600','2000')
describe HTML_IMAGE_ONLY_20  HTML: images with 1600-2000 bytes of words
body     HTML_IMAGE_ONLY_24  eval:html_image_only('2000','2400')
describe HTML_IMAGE_ONLY_24  HTML: images with 2000-2400 bytes of words

# HTML_IMAGE_RATIO - more image area than text (ratio).
# Four consecutive bands of width 0.002, from 0.000 up to 0.008; all bands
# share the same description text.
body     HTML_IMAGE_RATIO_02  eval:html_image_ratio('0.000','0.002')
describe HTML_IMAGE_RATIO_02  HTML has a low ratio of text to image area
body     HTML_IMAGE_RATIO_04  eval:html_image_ratio('0.002','0.004')
describe HTML_IMAGE_RATIO_04  HTML has a low ratio of text to image area
body     HTML_IMAGE_RATIO_06  eval:html_image_ratio('0.004','0.006')
describe HTML_IMAGE_RATIO_06  HTML has a low ratio of text to image area
body     HTML_IMAGE_RATIO_08  eval:html_image_ratio('0.006','0.008')
describe HTML_IMAGE_RATIO_08  HTML has a low ratio of text to image area

# Anchor text: "push" or "go", optional whitespace, then "here" or "this"
# (case-insensitive).
body     HTML_LINK_PUSH_HERE  eval:html_text_match('anchor', '(?i)(?:push|go)\s*(?:here|this)')
describe HTML_LINK_PUSH_HERE  HTML link text says "push here" or similar

# HTML obfuscation: bands of 'obfuscation_ratio'.
# The first band starts at 5% (there is no 0-5% rule); the rest are 10% wide.
body     HTML_OBFUSCATE_05_10   eval:html_range('obfuscation_ratio','.05','.1')
describe HTML_OBFUSCATE_05_10   Message is 5% to 10% HTML obfuscation
body     HTML_OBFUSCATE_10_20   eval:html_range('obfuscation_ratio','.1','.2')
describe HTML_OBFUSCATE_10_20   Message is 10% to 20% HTML obfuscation
body     HTML_OBFUSCATE_20_30   eval:html_range('obfuscation_ratio','.2','.3')
describe HTML_OBFUSCATE_20_30   Message is 20% to 30% HTML obfuscation
body     HTML_OBFUSCATE_30_40   eval:html_range('obfuscation_ratio','.3','.4')
describe HTML_OBFUSCATE_30_40   Message is 30% to 40% HTML obfuscation
body     HTML_OBFUSCATE_40_50   eval:html_range('obfuscation_ratio','.4','.5')
describe HTML_OBFUSCATE_40_50   Message is 40% to 50% HTML obfuscation
body     HTML_OBFUSCATE_50_60   eval:html_range('obfuscation_ratio','.5','.6')
describe HTML_OBFUSCATE_50_60   Message is 50% to 60% HTML obfuscation
body     HTML_OBFUSCATE_60_70   eval:html_range('obfuscation_ratio','.6','.7')
describe HTML_OBFUSCATE_60_70   Message is 60% to 70% HTML obfuscation
body     HTML_OBFUSCATE_70_80   eval:html_range('obfuscation_ratio','.7','.8')
describe HTML_OBFUSCATE_70_80   Message is 70% to 80% HTML obfuscation
body     HTML_OBFUSCATE_80_90   eval:html_range('obfuscation_ratio','.8','.9')
describe HTML_OBFUSCATE_80_90   Message is 80% to 90% HTML obfuscation
body     HTML_OBFUSCATE_90_100  eval:html_range('obfuscation_ratio','.9','1.0')
describe HTML_OBFUSCATE_90_100  Message is 90% to 100% HTML obfuscation

# backhair - idea from backhair set by Jennifer Wheeler and Adam Lopresto.
# Three bands of 'backhair_count': 1-4, 4-8, and 8 with no upper limit ('inf').
# All bands share the same description text.
body     HTML_BACKHAIR_2  eval:html_range('backhair_count', '1', '4')
describe HTML_BACKHAIR_2  HTML tags used to obfuscate words
body     HTML_BACKHAIR_4  eval:html_range('backhair_count', '4', '8')
describe HTML_BACKHAIR_4  HTML tags used to obfuscate words
body     HTML_BACKHAIR_8  eval:html_range('backhair_count', '8', 'inf')
describe HTML_BACKHAIR_8  HTML tags used to obfuscate words

# HTML attribute testing

# 'attr_bad' in the 0.75-1.0 band
body     HTML_ATTR_BAD     eval:html_range('attr_bad','0.75','1.0')
describe HTML_ATTR_BAD     HTML has many bad attributes in tags

# 'attr_unique_bad' in the 0.5-1.0 band
body     HTML_ATTR_UNIQUE  eval:html_range('attr_unique_bad','0.5','1.0')
describe HTML_ATTR_UNIQUE  HTML appears to have random attributes in tags

# Image tags used as web bugs.
body     HTML_WEB_BUGS           eval:html_test('web_bugs')
describe HTML_WEB_BUGS           Image tag intended to identify you

# Tag balance: fires when the balance for the named tag is non-zero.
body     HTML_TAG_BALANCE_BODY   eval:html_tag_balance('body', '!= 0')
describe HTML_TAG_BALANCE_BODY   HTML has unbalanced "body" tags

body     HTML_TAG_BALANCE_HEAD   eval:html_tag_balance('head', '!= 0')
describe HTML_TAG_BALANCE_HEAD   HTML has unbalanced "head" tags

# Tag existence: fires when the named tag is present.
body     HTML_TAG_EXIST_MARQUEE  eval:html_tag_exists('marquee')
describe HTML_TAG_EXIST_MARQUEE  HTML has "marquee" tag

body     HTML_TAG_EXIST_TBODY    eval:html_tag_exists('tbody')
describe HTML_TAG_EXIST_TBODY    HTML has "tbody" tag

# Percentage of tags that are not legal elements in HTML ('bad_tag_ratio'),
# split into ten consecutive 10%-wide bands.
body     HTML_BADTAG_00_10   eval:html_range('bad_tag_ratio','0.00','0.10')
describe HTML_BADTAG_00_10   HTML message is 0% to 10% bad tags
body     HTML_BADTAG_10_20   eval:html_range('bad_tag_ratio','0.10','0.20')
describe HTML_BADTAG_10_20   HTML message is 10% to 20% bad tags
body     HTML_BADTAG_20_30   eval:html_range('bad_tag_ratio','0.20','0.30')
describe HTML_BADTAG_20_30   HTML message is 20% to 30% bad tags
body     HTML_BADTAG_30_40   eval:html_range('bad_tag_ratio','0.30','0.40')
describe HTML_BADTAG_30_40   HTML message is 30% to 40% bad tags
body     HTML_BADTAG_40_50   eval:html_range('bad_tag_ratio','0.40','0.50')
describe HTML_BADTAG_40_50   HTML message is 40% to 50% bad tags
body     HTML_BADTAG_50_60   eval:html_range('bad_tag_ratio','0.50','0.60')
describe HTML_BADTAG_50_60   HTML message is 50% to 60% bad tags
body     HTML_BADTAG_60_70   eval:html_range('bad_tag_ratio','0.60','0.70')
describe HTML_BADTAG_60_70   HTML message is 60% to 70% bad tags
body     HTML_BADTAG_70_80   eval:html_range('bad_tag_ratio','0.70','0.80')
describe HTML_BADTAG_70_80   HTML message is 70% to 80% bad tags
body     HTML_BADTAG_80_90   eval:html_range('bad_tag_ratio','0.80','0.90')
describe HTML_BADTAG_80_90   HTML message is 80% to 90% bad tags
body     HTML_BADTAG_90_100  eval:html_range('bad_tag_ratio','0.90','1.00')
describe HTML_BADTAG_90_100  HTML message is 90% to 100% bad tags

# Percentage of unique non-elements in HTML ('non_element_ratio'),
# split into ten consecutive 10%-wide bands.
body     HTML_NONELEMENT_00_10   eval:html_range('non_element_ratio','0.00','0.10')
describe HTML_NONELEMENT_00_10   0% to 10% of HTML elements are non-standard
body     HTML_NONELEMENT_10_20   eval:html_range('non_element_ratio','0.10','0.20')
describe HTML_NONELEMENT_10_20   10% to 20% of HTML elements are non-standard
body     HTML_NONELEMENT_20_30   eval:html_range('non_element_ratio','0.20','0.30')
describe HTML_NONELEMENT_20_30   20% to 30% of HTML elements are non-standard
body     HTML_NONELEMENT_30_40   eval:html_range('non_element_ratio','0.30','0.40')
describe HTML_NONELEMENT_30_40   30% to 40% of HTML elements are non-standard
body     HTML_NONELEMENT_40_50   eval:html_range('non_element_ratio','0.40','0.50')
describe HTML_NONELEMENT_40_50   40% to 50% of HTML elements are non-standard
body     HTML_NONELEMENT_50_60   eval:html_range('non_element_ratio','0.50','0.60')
describe HTML_NONELEMENT_50_60   50% to 60% of HTML elements are non-standard
body     HTML_NONELEMENT_60_70   eval:html_range('non_element_ratio','0.60','0.70')
describe HTML_NONELEMENT_60_70   60% to 70% of HTML elements are non-standard
body     HTML_NONELEMENT_70_80   eval:html_range('non_element_ratio','0.70','0.80')
describe HTML_NONELEMENT_70_80   70% to 80% of HTML elements are non-standard
body     HTML_NONELEMENT_80_90   eval:html_range('non_element_ratio','0.80','0.90')
describe HTML_NONELEMENT_80_90   80% to 90% of HTML elements are non-standard
body     HTML_NONELEMENT_90_100  eval:html_range('non_element_ratio','0.90','1.00')
describe HTML_NONELEMENT_90_100  90% to 100% of HTML elements are non-standard

# Short HTML messages with certain attributes

# 'length' below 170 on its own
body     HTML_SHORT_LENGTH    eval:html_eval('length', '< 170')
describe HTML_SHORT_LENGTH    HTML is extremely short

# 'length' below 512 combined with the presence of any "<!...>" comment
body     __HTML_LENGTH_512    eval:html_eval('length', '< 512')
body     __COMMENT_EXISTS     eval:html_text_match('comment', '<!.*?>')
meta     HTML_SHORT_COMMENT   (__HTML_LENGTH_512 && __COMMENT_EXISTS)
describe HTML_SHORT_COMMENT   HTML is very short with HTML comments

# 'length' below 384 combined with the presence of a "center" tag
body     __HTML_LENGTH_384    eval:html_eval('length', '< 384')
body     __TAG_EXISTS_CENTER  eval:html_tag_exists('center')
meta     HTML_SHORT_CENTER    (__HTML_LENGTH_384 && __TAG_EXISTS_CENTER)
describe HTML_SHORT_CENTER    HTML is very short with CENTER tag

# Title text contains no non-whitespace character (pattern \S does not match).
body     HTML_TITLE_EMPTY     eval:html_text_not_match('title', '(?s)\S')
describe HTML_TITLE_EMPTY     HTML title contains no text

# Title contains "untitled" or "new page <digits>" (case-insensitive).
body     HTML_TITLE_UNTITLED  eval:html_text_match('title', '(?i)(?:untitled|new page \d+)')
describe HTML_TITLE_UNTITLED  HTML title contains "Untitled"

###########################################################################
# meta tests

# NOTE(review): __HIGHBITS, MIME_HTML_ONLY, __TAG_EXISTS_HTML and __MIME_HTML
# are not defined in the part of the rules visible here; presumably they come
# from another rules file -- confirm they exist, since the negated terms below
# depend on them.

# Faraway charset in the HTML markup, combined with __HIGHBITS.
body     __HTML_CHARSET_FARAWAY  eval:html_charset_faraway()
meta     HTML_CHARSET_FARAWAY    (__HTML_CHARSET_FARAWAY && __HIGHBITS)
describe HTML_CHARSET_FARAWAY    A foreign language charset used in HTML markup
tflags   HTML_CHARSET_FARAWAY    userconf

# MIME_HTML_ONLY hit while __TAG_EXISTS_HTML did not.
meta     HTML_MIME_NO_HTML_TAG   MIME_HTML_ONLY && !__TAG_EXISTS_HTML
describe HTML_MIME_NO_HTML_TAG   HTML-only message, but there is no HTML tag

# HTML_MESSAGE hit while __MIME_HTML did not.
meta     HTML_MISSING_CTYPE      (!__MIME_HTML && HTML_MESSAGE)
describe HTML_MISSING_CTYPE      Message is HTML without HTML Content-Type

###########################################################################
# rawbody HTML tests

# A single tag that carries onMouseOver= and, later in the same tag,
# window.status= (case-insensitive).
rawbody  HIDE_WIN_STATUS          /<[^>]+onMouseOver=[^>]+window\.status=/i
describe HIDE_WIN_STATUS          Javascript to hide URLs in browser

# One or more "<!...>" constructs wedged directly between two characters:
#   _A: both neighbours are word characters
#   _B: looser - the left neighbour is any non-space other than ">", the right
#       neighbour any non-space other than "<"
# _A counts for any HTML_MESSAGE; _B only counts together with MIME_HTML_ONLY.
rawbody  __OBFUSCATING_COMMENT_A  /\w(?:<![^>]*>)+\w/
rawbody  __OBFUSCATING_COMMENT_B  /[^\s>](?:<![^>]*>)+[^\s<]/
meta     OBFUSCATING_COMMENT      ((__OBFUSCATING_COMMENT_A && HTML_MESSAGE) || (__OBFUSCATING_COMMENT_B && MIME_HTML_ONLY))
describe OBFUSCATING_COMMENT      HTML comments which obfuscate text

# Spams that are assembled from a Javascript array.
# __JS_FROMCHARCODE looks for String.fromCharCode( <name>[<index>] ^ ...,
# i.e. an indexed element followed by the XOR op; the meta rule additionally
# requires a document.write somewhere in the raw body.
rawbody  __JS_FROMCHARCODE  /String\.fromCharCode\s*\(\s*\S+\s*\[\s*\S+\s*\]\s*\^/
rawbody  __JS_DOCWRITE      /document\.write/
meta     JS_FROMCHARCODE    (__JS_FROMCHARCODE && __JS_DOCWRITE)
describe JS_FROMCHARCODE    Document is built from a Javascript charcode array

# A-Z, a-z, 0-9 written as decimal character references (&#NN; with optional
# leading zeros).  Code points covered by the alternation:
#   4[89] | 5[0-7]                      48-57   (0-9)
#   6[5-9] | [78]\d | 9[0...]           65-90   (A-Z)
#   9[...789] | 1[01]\d | 12[012]       97-122  (a-z)
# "6[5-9]" and "[78]\d" must stay separate alternatives: fused together they
# would demand a four-digit code (6570-6989) and miss 65-89 entirely.
rawbody  ENTITY_DEC_ALPHANUM  /\&\#0*(?:4[89]|5[0-7]|6[5-9]|[78]\d|9[0789]|1[01]\d|12[012])\;/
describe ENTITY_DEC_ALPHANUM  HTML contains needlessly encoded characters

# ! $ % ' ( ) , - . / : ; = ? @ _
# a good possible rule that may resurface
#rawbody ENTITY_DEC_OTHER	/\&\#0*(?:3[3679]|4[014567]|5[89]|6[134]|95)\;/
#describe ENTITY_DEC_OTHER	HTML contains needlessly encoded punctuation