Try   HackMD

File Descriptor

延伸閱讀: 「一切皆為檔案」的理念與解讀

檔案描述符(file descriptor)就是一個整數,它對每個程序(process)都是私有的,在 UNIX 系統中用來存取檔案。因此,一旦檔案被開啟,只要你有權限,你就可以透過這個檔案描述符去讀取或寫入檔案。 從這個角度來看,file descriptor 是一種「能力」(capability) —— 它是一個不透明的操作控制代碼,可以賦予你執行某些特定操作的能力。

也可以把 file descriptor 想像成一個指向 file 類型物件的指標,一旦你持有這個物件,你就能使用像 read() 和 write() 這些「方法」來存取檔案。在 UNIX 系統中,每個 process 的 proc 結構中都會維護一個簡單的資料結構(例如陣列),並以 file descriptor 作為索引,來追蹤目前有哪些檔案是被這個程序開啟的。這個陣列的每一個元素其實就是一個指向 struct file 的指標,而 struct file 則用來儲存目前正在讀寫的檔案的詳細資訊。

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

source

tips

可以用 strace 來追蹤程式呼叫了哪些系統呼叫(system call)、傳遞了什麼參數、回傳了什麼值

  • -f 可以追蹤 fork 出來的子程序;
  • -t 會列出每個呼叫發生的時間;
  • -e trace=open,close,read,write 只追蹤這幾個系統呼叫,忽略其他的。

範例

#include <fcntl.h>     // open()
#include <unistd.h>    // read(), write(), close()
#include <stdio.h>     // perror()

/*
 * Read up to sizeof(buffer) - 1 bytes from "test.txt" with read() and echo
 * them to standard output with write(), using raw file descriptors.
 *
 * Returns 0 on success, or 1 if open(), read() or write() fails (the failing
 * call is reported through perror()).
 */
int main(void) {
    char buffer[128];
    int fd = open("test.txt", O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    ssize_t bytes_read = read(fd, buffer, sizeof(buffer) - 1);
    if (bytes_read < 0) {
        perror("read");  // report first: close() may overwrite errno
        close(fd);       // release the descriptor on the error path too
        return 1;
    }

    buffer[bytes_read] = '\0';  // null terminate

    // write() may fail or transfer fewer bytes than requested, so keep
    // writing the remainder until everything that was read has been sent.
    ssize_t total_written = 0;
    while (total_written < bytes_read) {
        ssize_t n = write(STDOUT_FILENO, buffer + total_written,
                          (size_t)(bytes_read - total_written));
        if (n < 0) {
            perror("write");
            close(fd);
            return 1;
        }
        total_written += n;
    }

    close(fd);
    return 0;
}

接著執行

echo "Hello from file descriptor!" > test.txt
gcc -o fd_example fd_example.c
$ strace -e trace=openat,read,write,close ./fd_example
openat(AT_FDCWD, "/usr/local/cuda-11.8/lib64/glibc-hwcaps/x86-64-v3/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/local/cuda-11.8/lib64/glibc-hwcaps/x86-64-v2/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/local/cuda-11.8/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "glibc-hwcaps/x86-64-v3/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "glibc-hwcaps/x86-64-v2/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
close(3)                                = 0
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\220\243\2\0\0\0\0\0"..., 832) = 832
close(3)                                = 0
openat(AT_FDCWD, "test.txt", O_RDONLY)  = 3
read(3, "Hello from file descriptor!\n", 127) = 28
write(1, "Hello from file descriptor!\n", 28Hello from file descriptor!
) = 28
close(3)                                = 0
+++ exited with 0 +++

在 Linux 上,glibc 的 open() 底層是透過 openat() 系統呼叫實作的,所以 strace 的 trace 參數要寫 openat(只寫 open 會追蹤不到)。

  • int fd = open("test.txt", O_RDONLY); => openat(AT_FDCWD, "test.txt", O_RDONLY) = 3 : = 3 表示這次開啟成功,系統回傳 file descriptor 3
  • read(fd, buffer, sizeof(buffer) - 1); => read(3, "Hello from file descriptor!\n", 127) = 28 : fd = 3;127 是 sizeof(buffer) - 1,也就是最多要讀的 byte 數;實際成功從 test.txt 讀了 28 bytes,並存到 buffer[] 裡。
  • write(STDOUT_FILENO, buffer, bytes_read); => write(1, "Hello from file descriptor!\n", 28) = 28 : STDOUT_FILENO 是 1;把剛剛讀到的 28 個字元寫到標準輸出(螢幕);成功寫了 28 bytes。
  • close(fd); => close(3) = 0 : 關閉 file descriptor 3;回傳 0 表示關閉成功。

為了要觀察更詳細資訊,在 close(fd) 之前加一行 sleep(120)

$ ls -l /proc/<PID>/fd/
total 0
lrwx------ 1 neat neat 64 Apr 23 11:27 0 -> /dev/pts/3
lrwx------ 1 neat neat 64 Apr 23 11:27 1 -> /dev/pts/3
l-wx------ 1 neat neat 64 Apr 23 11:27 19 -> /home/neat/.vscode-server/data/logs/20250423T093438/remoteagent.log
lrwx------ 1 neat neat 64 Apr 23 11:27 2 -> /dev/pts/3
l-wx------ 1 neat neat 64 Apr 23 11:27 20 -> /home/neat/.vscode-server/data/logs/20250423T093438/ptyhost.log
lrwx------ 1 neat neat 64 Apr 23 11:27 21 -> /dev/ptmx
l-wx------ 1 neat neat 64 Apr 23 11:27 22 -> /home/neat/.vscode-server/data/logs/20250423T093438/remoteTelemetry.log
lrwx------ 1 neat neat 64 Apr 23 11:27 23 -> /dev/ptmx
lrwx------ 1 neat neat 64 Apr 23 11:27 24 -> /dev/ptmx
lr-x------ 1 neat neat 64 Apr 23 11:27 3 -> /home/neat/YCL/Workspace/c_test/test.txt

可看到最後一行 fd = 3,即是我在程式中 open() 的檔案

$ cat /proc/1609218/fdinfo/3
pos:    28
flags:  0100000
mnt_id: 31
ino:    37501928
  • pos: 目前的檔案位移位置(offset),這裡是 28,代表你的程式已經從檔案讀了 28 bytes。
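  • flags: 以八進位表示的檔案存取模式與狀態旗標(對應 open() 的 flags);這裡的 0100000 代表 O_RDONLY(0)加上核心在 64 位元系統上自動設定的 O_LARGEFILE。
  • mnt_id: 此檔案所在掛載點(mount)的 ID,可對照 /proc/<PID>/mountinfo。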
  • ino: 檔案的 inode number

範例:

prompt> echo hello > foo
prompt> cat foo
hello
prompt> strace cat foo
...
openat(AT_FDCWD, "foo", O_RDONLY)       = 3
fstat(3, {st_mode=S_IFREG|0664, st_size=6, ...}) = 0
fadvise64(3, 0, 0, POSIX_FADV_SEQUENTIAL) = 0
mmap(NULL, 139264, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x70950e2bd000
read(3, "hello\n", 131072)              = 6
write(1, "hello\n", 6hello
)                  = 6
read(3, "", 131072)                     = 0
munmap(0x70950e2bd000, 139264)          = 0
close(3)                                = 0
...

cat 程式第一步會開啟檔案來進行讀取。我們可以發現: 檔案是以唯讀(read-only)方式開啟的,這由 O_RDONLY 旗標所示;open() 成功並回傳了一個檔案描述符(fd)3。

當 process open() 一個新的檔案時(像 cat 這樣),它幾乎一定會得到 fd 3。這是因為每個正在執行的 process 都有三個預設已開啟的檔案: stdin(fd 0); stdout(fd 1); stderr(fd 2),而 open() 會回傳目前尚未使用的最小 file descriptor 編號。

Linux code

/*
 * Describes one file descriptor table: a pointer to an array of struct file
 * pointers plus bookkeeping fields.
 * NOTE(review): the per-field roles marked "presumably" are inferred from the
 * field names only -- confirm against fs/file.c before relying on them.
 */
struct fdtable {
	unsigned int max_fds;		/* presumably the capacity of the fd array -- TODO confirm */
	struct file __rcu **fd;      /* current fd array */
	unsigned long *close_on_exec;	/* presumably a bitmap of fds marked close-on-exec -- TODO confirm */
	unsigned long *open_fds;	/* presumably a bitmap of fds currently in use -- TODO confirm */
	unsigned long *full_fds_bits;	/* presumably a summary bitmap over open_fds -- TODO confirm */
	struct rcu_head rcu;		/* RCU head; presumably used to defer freeing this table -- TODO confirm */
};

file: https://github.com/torvalds/linux/blob/master/include/linux/fdtable.h

/*
 * Open file table structure
 *
 * Holds a pointer to the fdtable in use (fdt), an embedded fdtable (fdtab),
 * and fixed-size inline storage: the one-word *_init arrays and fd_array.
 * NOTE(review): presumably fdt points at fdtab, and fdtab at the inline
 * arrays, until the table has to grow -- confirm against fs/file.c.
 */
struct files_struct {
  /*
   * read mostly part
   */
	atomic_t count;			/* presumably a reference count for this structure -- TODO confirm */
	bool resize_in_progress;	/* presumably set while the fd table is being resized -- TODO confirm */
	wait_queue_head_t resize_wait;	/* presumably where waiters sleep during a resize -- TODO confirm */

	struct fdtable __rcu *fdt;	/* pointer to the fdtable in use (RCU-annotated) */
	struct fdtable fdtab;		/* fdtable embedded directly in this structure */
  /*
   * written part on a separate cache line in SMP
   */
	spinlock_t file_lock ____cacheline_aligned_in_smp;
	unsigned int next_fd;		/* presumably a hint for where to search for the next free fd -- TODO confirm */
	unsigned long close_on_exec_init[1];	/* inline one-word storage; presumably backs fdtable.close_on_exec initially */
	unsigned long open_fds_init[1];		/* inline one-word storage; presumably backs fdtable.open_fds initially */
	unsigned long full_fds_bits_init[1];	/* inline one-word storage; presumably backs fdtable.full_fds_bits initially */
	struct file __rcu * fd_array[NR_OPEN_DEFAULT];	/* inline array of NR_OPEN_DEFAULT struct file pointers */
};

file: https://github.com/torvalds/linux/blob/master/include/linux/fdtable.h

/**
 * struct file - Represents a file
 * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
 * @f_mode: FMODE_* flags often used in hotpaths
 * @f_op: file operations
 * @f_mapping: Contents of a cacheable, mappable object.
 * @private_data: filesystem or driver specific data
 * @f_inode: cached inode
 * @f_flags: file flags
 * @f_iocb_flags: iocb flags
 * @f_cred: stashed credentials of creator/opener
 * @f_owner: file owner
 * @f_path: path of the file
 * @f_pos_lock: lock protecting file position
 * @f_pipe: specific to pipes
 * @f_pos: file position
 * @f_security: LSM security context of this file
 * @f_wb_err: writeback error
 * @f_sb_err: per sb writeback errors
 * @f_ep: link of all epoll hooks for this file
 * @f_task_work: task work entry point
 * @f_llist: work queue entrypoint
 * @f_ra: file's readahead state
 * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
 * @f_ref: reference count
 *
 * Two groups of members live in anonymous unions and therefore share
 * storage: f_pos_lock/f_pipe, and f_task_work/f_llist/f_ra/f_freeptr.
 * f_security and f_ep exist only when CONFIG_SECURITY and CONFIG_EPOLL,
 * respectively, are defined.
 *
 * NOTE(review): the "cacheline N boundary" markers presumably describe one
 * particular configuration without layout randomization -- confirm before
 * relying on the offsets.
 */
struct file {
	spinlock_t			f_lock;
	fmode_t				f_mode;
	const struct file_operations	*f_op;
	struct address_space		*f_mapping;
	void				*private_data;
	struct inode			*f_inode;
	unsigned int			f_flags;
	unsigned int			f_iocb_flags;
	const struct cred		*f_cred;
	struct fown_struct		*f_owner;
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct path			f_path;
	/* Anonymous union: only one of these members is meaningful for a given file. */
	union {
		/* regular files (with FMODE_ATOMIC_POS) and directories */
		struct mutex		f_pos_lock;
		/* pipes */
		u64			f_pipe;
	};
	loff_t				f_pos;
	/* Present only when CONFIG_SECURITY is defined. */
#ifdef CONFIG_SECURITY
	void				*f_security;
#endif
	/* --- cacheline 2 boundary (128 bytes) --- */
	errseq_t			f_wb_err;
	errseq_t			f_sb_err;
	/* Present only when CONFIG_EPOLL is defined. */
#ifdef CONFIG_EPOLL
	struct hlist_head		*f_ep;
#endif
	/*
	 * Anonymous union: these members overlap in memory.
	 * NOTE(review): presumably each is used at a different stage of the
	 * file's lifetime -- confirm against fs/file_table.c.
	 */
	union {
		struct callback_head	f_task_work;
		struct llist_node	f_llist;
		struct file_ra_state	f_ra;
		freeptr_t		f_freeptr;
	};
	file_ref_t			f_ref;
	/* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
  __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */

file: https://github.com/torvalds/linux/blob/master/include/linux/fs.h


file: https://github.com/torvalds/linux/blob/master/include/linux/sched.h

reference

https://wiyi.org/linux-file-descriptor.html