.. Kenneth Lee 版权所有 2019-2020 :Authors: Kenneth Lee :Version: 1.0 单元测试的强与弱问题 ********************* 最近看了一个项目做的单元测试,很不满意——单元测试框架做得太好了。 单元测试的框架不能做得太好,单元测试的主角是被测试的那个代码,不是单元测试的框 架,把单元测试框架做得太好,结果就是测试效果大打折扣。 这就好比你盖房子,脚手架搭得比房子还漂亮,房子有可能搭的好吗? 我们看一些细节,比如你测试Linux内核的一个模块,这个模块包含了 ,用到里面的struct task_struct {}。你应该怎么打桩?有人会找一个Linux版本,把这 个结构整个拷贝进来::: struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif /* -1 unrunnable, 0 runnable, >0 stopped: */ volatile long state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; atomic_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; #endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; #endif int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; int nr_cpus_allowed; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ struct sched_info sched_info; struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm; struct mm_struct *active_mm; /* Per-thread vma caching: */ struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; #endif int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; #ifdef CONFIG_PSI unsigned sched_psi_wake_requeue:1; #endif /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; #endif #ifdef CONFIG_BLK_CGROUP /* to be used once the psi infrastructure lands upstream. */ unsigned use_memdelay:1; #endif /* * May usercopy functions fault on kernel addresses? * This is not just a single bit because this can potentially nest. */ unsigned int kernel_uaccess_faults_ok; unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 real_start_time; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; #ifdef CONFIG_POSIX_TIMERS struct task_cputime cputime_expires; struct list_head cpu_timers[3]; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; /* * executable name, excluding path. * * - normally initialized setup_new_exec() * - access it with [gs]et_task_comm() * - lock it with task_lock() */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; /* Thread group tracking: */ u32 parent_exec_id; u32 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* Mutex deadlock detection: */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; int hardirqs_enabled; int hardirq_context; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; unsigned int softirq_disable_event; unsigned int softirq_enable_event; int softirqs_enabled; int softirq_context; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; #ifdef CONFIG_BLOCK /* Stack plugging: */ struct blk_plug *plug; #endif /* VM state: */ struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; struct io_context *io_context; /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Seqence number to catch updates: */ seqcount_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_INTEL_RDT u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; struct numa_group *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; #endif struct tlbflush_unmap_batch tlb_ubc; struct rcu_head rcu; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* State flags for use by tracers: */ unsigned long trace; /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; #endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_BLK_CGROUP struct request_queue *throttle_queue; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ atomic_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; unsigned long prev_lowest_stack; #endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; 够长吧?——不够。 这个结构里面还用了用到了thread_info这个结构呢,然后接着把thread_info也定义进来 ,thread_info又用了另一个结构呢,你又把里面那个解构定义进来?——疯了。 哪里需要这么复杂呢,这样定义就好了::: struct task_struct {}; 然后你用到哪个成员,往里面补一个int member就结了。打桩,打桩。你打的是桩,不是 真东西。你盖的是脚手架,不是房子啊。脚手架的投资比房子大,房子的建设必然受脚手 架的影响的啊。 你桩做得很精巧,很多用例肯定就上不去的呀。(注1) 再来一个例子。比如你的程序调用了remap_pfn_range(),这个桩应该怎么打?有人会这样 打::: int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return 0; } 真是吃饱没事干,你都没有放用例进去呢,这么复杂干什么?这样就可以啦::: #define remap_pfn_range(...) 0 还有kmalloc应该怎么打?显然你不需要这样::: static __always_inline void *kmalloc(size_t size, gfp_t flags) { return malloc(size); } 这样就可以啦::: char * testcase101_mem[1024]; void *kmalloc(...) { if (testcase==101) return testcase101_mem; return NULL; } 这才是打桩。你要测试的是你的程序,你要全部聚焦到你的程序的: 在各种极端的情况下,查找有没有遗漏?资源有没有泄漏?计算结果是否正确?等等等等 。这些逻辑才是核心,核心的不是你的测试桩。比如下面这个程序::: static inline int my_iommu_map(struct queue *q, struct qfile_region *qfr) { struct device *dev = q->myhw->pdev; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); int i, j, ret; if (!domain) return -ENODEV; for (i = 0; i < qfr->nr_pages; i++) { get_page(qfr->pages[i]); ret = iommu_map(domain, qfr->iova + i * PAGE_SIZE, page_to_phys(qfr->pages[i]), PAGE_SIZE, qfr->prot | q->uacce->prot); if (ret) { dev_err(dev, "iommu_map page %i fail %d\n", i, ret); goto err_with_map_pages; } } return 0; err_with_map_pages: for (j = i-1; j >= 0; j--) { iommu_unmap(domain, qfr->iova + j * PAGE_SIZE, PAGE_SIZE); put_page(qfr->pages[j]); } return ret; } 你看,单元测试的聚焦点应该是:nr_pages是0, 1, 10的时候,这个流程是不是正确的 。iommu_map()在第一次,中间,最后一次的时候失败的话,是不是所有map过的page都被 释放了,所有page没有被get过。 (实际上,这个流程是有错误的,但肉眼很难看出来,但单元测试可以很轻易找出这个错 误) 而至于page_to_phys()怎么实现,关你鬼事。那是另一个系统实现得对不对,或者你对那 个系统的预期或者理解对不对的问题,那是集成测试的范畴,不是单元测试的范畴。 写程序的时候我很强调每个语句的“语义”,但单元测试的时候,我们要彻底忘掉“语义”, 眼中只有“循环”,“赋值”,“退出”这样的概念。在这个时候,dev_err不是打印, put_page()不是释放,它们都只是流程中的一个“经过点”。你不是要模拟一个真正的错误 日志或者内存释放。你要验证你设计的流程,是不是按你设定的路径在走。 关于单元测试本身的观点,我表达完了,最后说说构架和哲学。很多人觉得,我“严格要求 ”,事事做到“尽善尽美”肯定是没有错的。这是一个很生动的例子:你的观点是不对的。天 之道,高者抑之,下者举之。把事情做成是要动脑子的,是要“准”的,不是要“极致”的。 而不确定目的,无所谓“准”。 所以,我们要警惕两个设计中的常见误区。第一,不定义目的,不考虑目的,只要“努力”。 第二,寻求表面的好看:好比前面我提到的测试框架。开始的时候其实不是这样的,但发 展一段时间以后,或者独立交个一个团队后,这个团队要展现自己的“能力”,“绩效”,就 忍不住要炫技了。对那个团队来说,这个无可厚非。但对架构师来说,让非主角“好看”,“ 强”,就是让主业务“失去关注和重点”,“弱”。强弱的转变,可以发生得很快。 九层之台,起于垒土。在垒土的阶段就应该呈现为垒土,如果这个阶段就呈现为高台,这 个台就垒不起来。 注1:关于是否自己mock所有的头文件的问题 我基于本文和一个实际的例子在部门内部做了一个培训,现场不少人对不使用Linux的全集 头文件来做桩有不同意见。 这个问题首先这样说:你如果能保证聚焦到把各种极端流程都验证过,你怎么打桩,这个 不是关键问题。我们的重点不在这里。我的关注点是你把头文件搭得太好了,你失去了从 结构,定义这些角度注入数据的能力了。如果你的情形是都能解决,我没有意见。 其次是这样,很多人可能有一个误区,觉得这样打桩法,工作量很大。其实你要试一试, 我的经验是,每个模块的对外接口都是有限的,这个工作量其实相当有限。 最后说点玄学。我大学的时候看了很多黄易的武侠小说,里面的超级高手,常常在大战前 做一件事:为自己削一根长矛,或者打一把剑。以保证自己在大战前,对自己的武器有一 个“点点滴滴”的认识。 我个人觉得自己为每个用到的外部接口打桩,作用也在这里。单元测试的目的,说到底是 提升对自己代码的信心。你一两万行的代码,几十个函数,变成一个模块,然后黑盒地扔 到机架上,BIOS中,然后你一跑,有数据出来了。你知道里面经历了什么?该走的流程有 没有走过,各种边界有没有碰过? UT是一个削长矛的过程,而确切知道,你自己的模块对各种结构的定义是什么。是这个削 长矛过程的一部分,这个必要性,你做一次就发现了。 这是我的经验。 补充2019/5/10: 这个文档上一次编辑的时间是2018年12月,我写完后,对应团队没有接受我的意见,继续 按原来的方案做UT,我给他们留了一句话:你们这样玩,撑不过一年。现在过了不到半年 ,他们的测试例都废了……这个事情证明了我的观点,但我一点都高兴不起来,因为这些资 源一定程度上也是我的。认知是成本,不过如此。