# IOT Malware ## feature selection ### library 我先分析了在.json檔中,data常引用的library,總共出現的library為617個,但發現實際有引用library的data相對較少,許多library只出現1、2次,為了減少feature的數量,我只選擇其中出現次數超過1000次的library(總data數接近50000),並分析這些library的用途。 | library | number | | ------------ | ------ | | libc.so.0 | 1879 | | libc.so.6 | 2494 | | libm.so | 1254 | | libdl.so | 1218 | | liblog.so | 1064 | | libc.so | 1280 | | libstdc++.so | 1217 | #### libc.so 其為standard C library,可推測此程式是以C語言來撰寫。 #### libm.so 為c math 的library #### libdl.so 為dynamic-link-library,即動態連結函式庫,可以不須一次將library的程式都放入記憶體中,提高記憶體效率。 #### liblog.so 還不到非常了解,但應該是Android上用來輸出log(日誌)的函式庫,而非作為登入使用 #### 來計算library出現次數的程式碼 ``` import json import os library=[] path = 'C:\IOT malware\dataset\dataset' for filename in os.listdir(path): if(filename!='ffff9e89f724a81de3a52b640d9e719d.json'): with open(filename) as f: data=json.load(f) for lib in data['libraries']: library.append(lib) else: for i in set(library): if(library.count(i) > 1000): print(i,library.count(i)) ``` --- 可能不需要太多feature,剛剛發現在data中,除了gafgyt,tsunami,mirai,其他malware都沒有被特別分類,故我們大概只需分類其中的4類就好。 --- ### function call 分析function中,在linux檔案中選擇有被引用超過10000次的function(總共有37079種) :::spoiler execve __pthread_return_void inet_addr raise __GI_strchr __GI_inet_addr __GI_getgid __GI_fcntl recvLine __GI_fseeko64 isatty getegid __GI_inet_ntoa setsockopt _vfprintf_internal __GI_inet_ntoa_r prints nanosleep sendSTD __pthread_mutex_init __GI_wait4 __GI___errno_location fgets_unlocked rand __GI_kill __GI_rawmemchr sockprintf zprintf __fgetc_unlocked __GI_memcpy access __vfork __encode_question __GI_strcasecmp chdir memmove __GI___uClibc_init __GI_tolower __GI___fgetc_unlocked calloc fclose lseek64 getOurIP __GI_sendto sigaction sbrk memset wildString strcpy __stdio_rfill memrchr _ppfs_setargs strchr __heap_alloc __decode_header sendto __GI_ioctl __GI_exit srand __GI_poll __GI_strncpy mmap __GI_wcsrtombs __GI_strstr __GI_munmap system __GI_write poll signal __GI_initstate_r __stdio_seek makeIPPacket kill __GI_getpid __GI_fclose 
__stdio_trans2r_o rand_cmwc __GI_getpagesize munmap recv read dup2 __GI_fgets_unlocked __GI___glibc_strerror_r frame_dummy __GI_memchr getBuild malloc __libc_getpid __GI_getrlimit __pthread_mutex_lock __open_etc_hosts main atoi getpagesize __stdio_adjust_position __GI_fgets write __GI_getc_unlocked geteuid __GI_atol __GI_strtol getgid __GI_fseek __GI_raise socket_connect __libc_fork __stdio_READ __encode_dotted negotiate _fpmaxtostr random __GI_strspn strncat strspn gethostbyname free abort __heap_free strpbrk __GI_memmove rawmemchr toupper __GI_strdup processCmd __open_nameservers __GI_brk listFork __GI___xpg_strerror_r __GI_wcrtomb __sigaddset htonl connect __GI_getuid __GI_getdtablesize print getc_unlocked __GI_gethostbyname_r fdgets __GI_strpbrk __GI_sigaddset __GI_fopen __do_global_dtors_aux __GI_sigemptyset __libc_recv __GI_mempcpy _fini __GI_strnlen free_mem initstate_r strnlen execl _pthread_cleanup_push_defer __GI_sigaction __pthread_mutex_unlock strtok __uClibc_fini getsockname close random_r __libc_read fflush_unlocked open __sigismember __GI_memrchr __GI_vsnprintf __GI_memset __GI_time __GI_fputs_unlocked __malloc_trim __xpg_strerror_r isspace memcpy __GI_inet_aton _stdio_openlist_dec_use __libc_sigaction __GI_strcmp __libc_fcntl sclose makeRandomStr __pthread_mutex_trylock __encode_header fseeko __GI_execl wcsnrtombs __GI_strtok_r __stdio_init_mutex __libc_write __GI_socket __libc_system fseeko64 vsnprintf __GI_strtok __GI_close __sigdelset matchPrompt ::: 而在android中只選擇超過3000次的function(總共有235566種) :::spoiler sleep _start getpid close exit time memset connect read strlen socket inet_addr kill open free fork __errno_location strcpy write select malloc abort atoi memcpy ::: 來計算function出現次數的程式碼 ``` import os fun =[] path = "C:\IOT malware\/functions\/functions\/android" for filename in os.listdir(path): if(filename!="tempCodeRunnerFile.python") : f = open(filename,'r') for line in f: fun.append(line) f.close for i in set(fun): if(fun.count(i)>3000): 
print(i) f.close() ``` ### 特徵選擇 最終選擇 * architecture * androidMal * static * stripped * fcns_counts * 和上述出現頻率較高的library和function call 擷取feature的code ``` import os import json func_list = ['sleep', '_start', 'getpid', 'close', 'exit', 'time', 'memset', 'connect', 'read', 'strlen', 'socket', 'inet_addr', 'kill', 'open', 'free', 'fork', '__errno_location', 'strcpy', 'write', 'select', 'malloc', 'abort', 'atoi', 'memcpy', 'execve', '__pthread_return_void', 'raise', '__GI_strchr', '__GI_inet_addr', '__GI_getgid', '__GI_fcntl', 'recvLine', '__GI_fseeko64', 'isatty', 'getegid', '__GI_inet_ntoa', 'setsockopt', '_vfprintf_internal', '__GI_inet_ntoa_r', 'prints', 'nanosleep', 'sendSTD', '__pthread_mutex_init', '__GI_wait4', '__GI___errno_location', 'fgets_unlocked', 'rand', '__GI_kill', '__GI_rawmemchr', 'sockprintf', 'zprintf', '__fgetc_unlocked', '__GI_memcpy', 'access', '__vfork', '__encode_question', '__GI_strcasecmp', 'chdir', 'memmove', '__GI___uClibc_init', '__GI_tolower', '__GI___fgetc_unlocked', 'calloc', 'fclose', 'lseek64', 'getOurIP', '__GI_sendto', 'sigaction', 'sbrk', 'wildString', '__stdio_rfill', 'memrchr', '_ppfs_setargs', 'strchr', '__heap_alloc', '__decode_header', 'sendto', '__GI_ioctl', '__GI_exit', 'srand', '__GI_poll', '__GI_strncpy', 'mmap', '__GI_wcsrtombs', '__GI_strstr', '__GI_munmap', 'system', '__GI_write', 'poll', 'signal', '__GI_initstate_r', '__stdio_seek', 'makeIPPacket', '__GI_getpid', '__GI_fclose', '__stdio_trans2r_o', 'rand_cmwc', '__GI_getpagesize', 'munmap', 'recv', 'dup2', '__GI_fgets_unlocked', '__GI___glibc_strerror_r', 'frame_dummy', '__GI_memchr', 'getBuild', '__libc_getpid', '__GI_getrlimit', '__pthread_mutex_lock', '__open_etc_hosts', 'main', 'getpagesize', '__stdio_adjust_position', '__GI_fgets', '__GI_getc_unlocked', 'geteuid', '__GI_atol', '__GI_strtol', 'getgid', '__GI_fseek', '__GI_raise', 'socket_connect', '__libc_fork', '__stdio_READ', '__encode_dotted', 'negotiate', '_fpmaxtostr', 'random', '__GI_strspn', 'strncat', 
'strspn', 'gethostbyname', '__heap_free', 'strpbrk', '__GI_memmove', 'rawmemchr', 'toupper', '__GI_strdup', 'processCmd', '__open_nameservers', '__GI_brk', 'listFork', '__GI___xpg_strerror_r', '__GI_wcrtomb', '__sigaddset', 'htonl', '__GI_getuid', '__GI_getdtablesize', 'print', 'getc_unlocked', '__GI_gethostbyname_r', 'fdgets', '__GI_strpbrk', '__GI_sigaddset', '__GI_fopen', '__do_global_dtors_aux', '__GI_sigemptyset', '__libc_recv', '__GI_mempcpy', '_fini', '__GI_strnlen', 'free_mem', 'initstate_r', 'strnlen', 'execl', '_pthread_cleanup_push_defer', '__GI_sigaction', '__pthread_mutex_unlock', 'strtok', '__uClibc_fini', 'getsockname', 'random_r', '__libc_read', 'fflush_unlocked', '__sigismember', '__GI_memrchr', '__GI_vsnprintf', '__GI_memset', '__GI_time', '__GI_fputs_unlocked', '__malloc_trim', '__xpg_strerror_r', 'isspace', '__GI_inet_aton', '_stdio_openlist_dec_use', '__libc_sigaction', '__GI_strcmp', '__libc_fcntl', 'sclose', 'makeRandomStr', '__pthread_mutex_trylock', '__encode_header', 'fseeko', '__GI_execl', 'wcsnrtombs', '__GI_strtok_r', '__stdio_init_mutex', '__libc_write', '__GI_socket', '__libc_system', 'fseeko64', 'vsnprintf', '__GI_strtok', '__GI_close', '__sigdelset', 'matchPrompt'] library_list = ['libc.so.0', 'libc.so.6', 'libm.so', 'libdl.so', 'liblog.so', 'libc.so', 'libstdc++.so'] one_hot = list() path = 'C:\IOT malware\/functions\/functions\linux' for txt in os.listdir(path): k = list() data_j = txt[:-4] with open(data_j+'.json') as f: data = json.load(f) k.append(data['label']) k.append(data['architecture']) if(data['androidMal'] ): k.append(1) else: k.append(0) if(data['static']): k.append(1) else: k.append(0) if(data['stripped'] ): k.append(1) else: k.append(0) k.append(data['fcns_counts']) for li in library_list: if li in data['libraries']: k.append(1) else: k.append(0) f.close() with open('C:\IOT malware\/functions\/functions\linux\/'+txt, 'r') as tf: data_list = tf.read().split('\n') for function in func_list: if(function in data_list): 
k.append(1) else: k.append(0) k.append(txt) one_hot.append(k) tf.close() path = 'C:\IOT malware\/functions\/functions\/android' for txt in os.listdir(path): k = list() data_j = txt[:-4] with open(data_j+'.json') as f: data = json.load(f) k.append(data['label']) k.append(data['architecture']) if(data['androidMal'] ): k.append(1) else: k.append(0) if(data['static'] ): k.append(1) else: k.append(0) if(data['stripped'] ): k.append(1) else: k.append(0) k.append(data['fcns_counts']) for li in library_list: if li in data['libraries']: k.append(1) else: k.append(0) f.close() with open('C:\IOT malware\/functions\/functions\/android\/'+txt, 'r') as tf: data_list = tf.read().split('\n') for function in func_list: if(function in data_list): k.append(1) else: k.append(0) k.append(txt) one_hot.append(k) tf.close() print(one_hot) path = 'C:\IOT malware\/functions\/functions/one_hot.txt' output = open(path, 'w') for i in range(len(one_hot)): for j in range(len(one_hot[i])): output.write(str(one_hot[i][j])) output.write('\t') output.write('\n') output.close() ``` ## training https://github.com/Aesop-programmer/ML/blob/main/train_iot_malware_1.ipynb 利用上述的特徵用lightgbm跑,其中architecture,func_list和libraries以one_hot_encoder的方式呈現,而使用的資料是同時包含android和linux的ELF檔,目前得到的 準確率為0.943 feature importance前10名為 ![](https://i.imgur.com/oAyAueP.png) 發現與function_call和library幾乎沒有關係,但從允中助教所做的結果觀察,function_call應該非常重要,可能需要再了解。 ## 目前問題與未來規劃(2022/02/07) * 更加熟悉lightgbm,目前基本上都是修改網路上的例子,對lightgbm的各項參數還未熟悉,目標為更熟悉各項參數,以及所選擇訓練model的原理。 * 瞭解data中其他的feature,如xx_info,想辦法將其轉成可用feature,和嘗試各種feature選法 * 瞭解學長所做的domain adaptation,與助教如何轉換某些feature,和為何利用這麼多feature(90892,比data還多)卻沒有造成overfitting。