# Spam_detect function optional setting ## Parameter intro - account_detect (bool, optional, defaults to True) - adopt detect account & phone message, default not adopt - url_detect (str, optional, defaults to None) - None : disable detect - 're' : only detect url by regular expression (lower accuracy) - 'urlextract' : detect url by urlextract (higher accuracy) - language_classify (bool, optional, defaults to True) - adopt language classify, default adopt - rulebase_detect (bool, optional, defaults to True) - adopt rulebase detection, default adopt - model_predict (bool, optional, defaults to True) - adopt TextCNN model prediction, default adopt - pinyin_mode (bool, optional, defaults to False) - adopt pinyin fix module, default not adopt ## analysis ### Full function #### account_detect=True, url_detect='urlextract', language_classify=True, rulebase_detect=True, model_predict=True, pinyin_mode=True ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=True, url_detect='urlextract', language_classify=True, rulebase_detect=True, model_predict=True, pinyin_mode=True) print(result) --------------result-------------- { 'message': '加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [0, 0, 0, 0, 0, 0, 0], 'ban': True, 'final_label': 5, 'url_count': 0, 'url_list': [], 'account_count': 1, 'account_list': ['SH9709200'], 'language': 'zh', 'predict_result': [], 'status': True } --------------in predetect.txt-------------- 加微SH9709200 价格:7元105万/ 40元1250万/ -> ["拼音: ([['jia4'], ['ge2']]), Label: 5"] ``` ### account_detect #### account_detect=True, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=True, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False) print(result) --------------result-------------- { 'message': 
'加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [0, 0, 0, 0, 0, 0, 0], 'ban': True, 'final_label': 5, 'account_count': 1, 'account_list': ['SH9709200'], 'status': True } ``` ### url_detect #### account_detect=False, url_detect='urlextract', language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=False, url_detect='urlextract', language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False) print(result) --------------result-------------- { 'message': '加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [0, 0, 0, 0, 0, 0, 0], 'ban': True, 'final_label': 5, 'url_count': 0, 'url_list': [], 'status': True } ``` ### language_classify #### account_detect=False, url_detect=None, language_classify=True, rulebase_detect=False, model_predict=False, pinyin_mode=False ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=False, url_detect=None, language_classify=True, rulebase_detect=False, model_predict=False, pinyin_mode=False) print(result) --------------result-------------- { 'message': '加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [0, 0, 0, 0, 0, 0, 0], 'ban': True, 'final_label': 5, 'language': 'zh', 'status': True } ``` ### rulebase_detect #### account_detect=False, url_detect=None, language_classify=False, rulebase_detect=True, model_predict=False, pinyin_mode=False ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=False, url_detect=None, language_classify=False, rulebase_detect=True, model_predict=False, pinyin_mode=False) print(result) --------------result-------------- { 'message': '加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [0, 0, 0, 0, 0, 0, 0], 'ban': True, 'final_label': 5, 'status': True } ``` ### model_predict #### account_detect=False, url_detect=None, 
language_classify=False, rulebase_detect=False, model_predict=True, pinyin_mode=False ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=True, pinyin_mode=False) print(result) --------------result-------------- { 'message': '加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [90.91, 0.0, 0.0, 0.0, 0.0, 9.09, 0.0], 'ban': True, 'final_label': 5, 'predict_result': [('加微', 0), ('SH9709200', 0), ('价格', 5), ('7', 0), ('元', 0), ('105', 0), ('万', 0), ('40', 0), ('元', 0), ('1250', 0), ('万', 0)], 'status': True } ``` ### pinyin_mode #### account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=True ```=python message = '加微SH9709200 价格:7元105万/ 40元1250万/' result = model.analysis(message, account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=True) print(result) --------------result-------------- { 'message': '加微SH9709200 价格:7元105万/ 40元1250万/', 'label_distribution': [0, 0, 0, 0, 0, 0, 0], 'ban': True, 'final_label': 5, 'status': True } --------------in predetect.txt-------------- 加微SH9709200 价格:7元105万/ 40元1250万/ -> ["拼音: ([['jia4'], ['ge2']]), Label: 5"] ``` ## batch_analysis ### Full function #### account_detect=True, url_detect='urlextract', language_classify=True, rulebase_detect=True, model_predict=True, pinyin_mode=True ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=True, url_detect='urlextract', language_classify=True, rulebase_detect=True, model_predict=True, pinyin_mode=True) print(message_df) --------------result-------------- message label_distribution ban final_label url_count url_list account_count account_list language predict_result 0 他妈的 [0, 0, 1, 0, 0, 0, 0] True 2 0 [] 0 zh [(他妈的, 2)] 1 台湾加油 [1, 1, 0, 0, 0, 0, 0] 
True 1 0 [] 0 zh [(台湾, 1), (加油, 0)] 2 加微:SH9709200 [0, 0, 0, 0, 0, 0, 0] True 5 0 [] 1 [SH9709200] zh [] 3 http://reurl.cc/6666 [0, 0, 0, 0, 0, 0, 0] True 5 1 [http://reurl.cc/6666] 0 fr [] --------------in predetect.txt-------------- 他妈的 -> ["拼音: ([['ta1'], ['ma1']]), Label: 2", "拼音: ([['ma1'], ['de']]), Label: 2", "拼音: ([['ta1'], ['ma1'], ['de']]), Label: 2"] 台湾加油 -> ["拼音: ([['tai2'], ['wan1']]), Label: 1"] ``` ### account_detect #### account_detect=True, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=True, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False) print(message_df) --------------result-------------- message label_distribution ban final_label account_count account_list 0 他妈的 [0, 0, 0, 0, 0, 0, 0] False 0 0 1 台湾加油 [0, 0, 0, 0, 0, 0, 0] False 0 0 2 加微:SH9709200 [0, 0, 0, 0, 0, 0, 0] True 5 1 [SH9709200] 3 http://reurl.cc/6666 [0, 0, 0, 0, 0, 0, 0] False 0 0 ``` ### url_detect #### account_detect=False, url_detect='urlextract', language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=False, url_detect='urlextract', language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=False) print(message_df) --------------result-------------- message label_distribution ban final_label url_count url_list 0 他妈的 [0, 0, 0, 0, 0, 0, 0] False 0 0 [] 1 台湾加油 [0, 0, 0, 0, 0, 0, 0] False 0 0 [] 2 加微:SH9709200 [0, 0, 0, 0, 0, 0, 0] False 0 0 [] 3 http://reurl.cc/6666 [0, 0, 0, 0, 0, 0, 0] True 5 1 [http://reurl.cc/6666] ``` ### language_classify #### account_detect=False, url_detect=None, language_classify=True, rulebase_detect=False, 
model_predict=False, pinyin_mode=False ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=False, url_detect=None, language_classify=True, rulebase_detect=False, model_predict=False, pinyin_mode=False) print(message_df) --------------result-------------- message label_distribution ban final_label language 0 他妈的 [0, 0, 0, 0, 0, 0, 0] False 0 zh 1 台湾加油 [0, 0, 0, 0, 0, 0, 0] False 0 zh 2 加微:SH9709200 [0, 0, 0, 0, 0, 0, 0] False 0 zh 3 http://reurl.cc/6666 [0, 0, 0, 0, 0, 0, 0] False 0 fr ``` ### rulebase_detect #### account_detect=False, url_detect=None, language_classify=False, rulebase_detect=True, model_predict=False, pinyin_mode=False ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=False, url_detect=None, language_classify=False, rulebase_detect=True, model_predict=False, pinyin_mode=False) print(message_df) --------------result-------------- message label_distribution ban final_label 0 他妈的 [0, 0, 0, 0, 0, 0, 0] False 0 1 台湾加油 [0, 0, 0, 0, 0, 0, 0] False 0 2 加微:SH9709200 [0, 0, 0, 0, 0, 0, 0] False 0 3 http://reurl.cc/6666 [0, 0, 0, 0, 0, 0, 0] False 0 ``` ### model_predict #### account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=True, pinyin_mode=False ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=True, pinyin_mode=False) print(message_df) --------------result-------------- message label_distribution ban final_label predict_result 0 他妈的 [0, 0, 1, 0, 0, 0, 0] True 2 [(他妈的, 2)] 1 台湾加油 [1, 1, 0, 0, 0, 0, 0] True 1 [(台湾, 1), (加油, 0)] 2 加微:SH9709200 [2, 0, 0, 0, 0, 0, 0] False 0 [(加微, 0), (SH9709200, 0)] 3 http://reurl.cc/6666 [5, 0, 0, 0, 0, 0, 0] False 0 [(http, 0), (re, 0), (url, 
0), (cc, 0), (6666,... ``` ### pinyin_mode #### account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=True ```=python messages = ['他媽的','台灣加油','加微:SH9709200','http://reurl.cc/6666'] message_df = model.batch_analysis(messages, account_detect=False, url_detect=None, language_classify=False, rulebase_detect=False, model_predict=False, pinyin_mode=True) print(message_df) --------------result-------------- message label_distribution ban final_label 0 他妈的 [0, 0, 0, 0, 0, 0, 0] False 0 1 台湾加油 [0, 0, 0, 0, 0, 0, 0] False 0 2 加微:SH9709200 [0, 0, 0, 0, 0, 0, 0] False 0 3 http://reurl.cc/6666 [0, 0, 0, 0, 0, 0, 0] False 0 --------------in predetect.txt-------------- 他妈的 -> ["拼音: ([['ta1'], ['ma1']]), Label: 2", "拼音: ([['ma1'], ['de']]), Label: 2", "拼音: ([['ta1'], ['ma1'], ['de']]), Label: 2"] 台湾加油 -> ["拼音: ([['tai2'], ['wan1']]), Label: 1"] ``` ## Precautions :::warning 1. model_predict 開啟時,若 account_detect 或 url_detect 也開啟且偵測到帳號或網址,不會做模型的預測,此訊息會直接歸類至廣告(5) 2. pinyin_mode 需依賴 model_predict,才會做切詞預測後的拼音修正,單獨開啟只會做拼音的預偵測,其結果會另外存在 predetect.txt 裡 3. account_detect & url_detect 兩者皆採用即為廣告偵測的開關 :::