perf software events
2018-07-31 15:20:38 pwl999

You can view the software events available on the system with the perf list command:

# simpleperf list sw
List of software events:
  cpu-clock
  task-clock
  page-faults
  context-switches
  cpu-migrations
  minor-faults
  major-faults
  alignment-faults
  emulation-faults

1. How it works

The software PMU collects trace data in two ways: "cpu-clock" and "task-clock" use sampling, while all the other events use instrumentation (probe points).

enum perf_sw_ids {
    PERF_COUNT_SW_CPU_CLOCK         = 0,
    PERF_COUNT_SW_TASK_CLOCK        = 1,
    PERF_COUNT_SW_PAGE_FAULTS       = 2,
    PERF_COUNT_SW_CONTEXT_SWITCHES      = 3,
    PERF_COUNT_SW_CPU_MIGRATIONS        = 4,
    PERF_COUNT_SW_PAGE_FAULTS_MIN       = 5,
    PERF_COUNT_SW_PAGE_FAULTS_MAJ       = 6,
    PERF_COUNT_SW_ALIGNMENT_FAULTS      = 7,
    PERF_COUNT_SW_EMULATION_FAULTS      = 8,
    PERF_COUNT_SW_DUMMY         = 9,
    PERF_COUNT_SW_BPF_OUTPUT        = 10,

    PERF_COUNT_SW_MAX,          /* non-ABI */
};
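As a quick illustration (a minimal sketch, not from the original article), a software event can be opened from user space with the perf_event_open(2) syscall; here we count context switches of the current task:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_SOFTWARE;               /* the software pmu */
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; /* an instrumented event */
    attr.disabled = 1;

    /* pid = 0, cpu = -1: this task, on any cpu */
    int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0) {
        perror("perf_event_open");
        return 1;
    }

    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    usleep(100 * 1000);                           /* do some work */
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

    uint64_t count;
    read(fd, &count, sizeof(count));
    printf("context-switches: %llu\n", (unsigned long long)count);
    close(fd);
    return 0;
}

With attr.sample_period left at 0 this is a pure counting event; setting a sample_period (or attr.freq plus sample_freq) would turn it into a sampling event.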

1.1 Instrumentation

Instrumented software events are essentially the same as tracepoint events; the difference is that tracepoint events reuse the trace_event infrastructure, while software events define their own, newly created probe points.

  • 1. Probe point for "page-faults":

el0_sync() -> el0_da() -> do_mem_abort() -> do_page_fault():

static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                   struct pt_regs *regs)
{

    /* Probe point for "page-faults": report data when it is hit */
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

}
  • 2. Probe point for "context-switches":

context_switch() -> prepare_task_switch() -> perf_event_task_sched_out():

static inline void perf_event_task_sched_out(struct task_struct *prev,
                         struct task_struct *next)
{
    /* Probe point for "context-switches": report data when it is hit */
    perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

    if (static_key_false(&perf_sched_events.key))
        __perf_event_task_sched_out(prev, next);
}
  • 3. Probe point for "cpu-migrations":

context_switch() -> finish_task_switch() -> perf_event_task_sched_in():

static inline void perf_event_task_sched_in(struct task_struct *prev,
                        struct task_struct *task)
{
    if (static_key_false(&perf_sched_events.key))
        __perf_event_task_sched_in(prev, task);

    /* Probe point for "cpu-migrations": report data when it is hit */
    if (perf_sw_migrate_enabled() && task->sched_migrated) {
        struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

        perf_fetch_caller_regs(regs);
        ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
        task->sched_migrated = 0;
    }
}
  • 4. Probe points for "minor-faults" & "major-faults":

el0_sync() -> el0_da() -> do_mem_abort() -> do_page_fault():

static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                   struct pt_regs *regs)
{

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            /* Probe point for "major-faults": report data when it is hit */
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
                      addr);
        } else {
            tsk->min_flt++;
            /* Probe point for "minor-faults": report data when it is hit */
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
                      addr);
        }
        if (fault & VM_FAULT_RETRY) {
            /*
             * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
             * starvation.
             */
            mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
            mm_flags |= FAULT_FLAG_TRIED;
            goto retry;
        }
    }

}
  • 5. Probe point for "alignment-faults":

There is no probe point on arm64.

  • 6. Probe point for "emulation-faults":

cp15barrier_handler():

static int cp15barrier_handler(struct pt_regs *regs, u32 instr)
{
    perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->pc);

}

1.2 Sampling

Sampling collects data once every so many events, where the "event" can be an instruction count, elapsed time, a cache-miss count, and so on. Normally a dedicated hardware PMU does the sampling; "cpu-clock" and "task-clock" instead sample in software using an hrtimer.

Where the hardware PMU does not support sampling, the hrtimer-based sampling here can also serve as a reference.

2. Event init

2.1 "software"

static struct pmu perf_swevent = {
    .task_ctx_nr    = perf_sw_context,

    .capabilities   = PERF_PMU_CAP_NO_NMI,

    .event_init = perf_swevent_init,
    .add        = perf_swevent_add,
    .del        = perf_swevent_del,
    .start      = perf_swevent_start,
    .stop       = perf_swevent_stop,
    .read       = perf_swevent_read,

    .events_across_hotplug = 1,
};

The corresponding init function:

static int perf_swevent_init(struct perf_event *event)
{
    u64 event_id = event->attr.config;

    if (event->attr.type != PERF_TYPE_SOFTWARE)
        return -ENOENT;

    /*
     * no branch sampling for software events
     */
    if (has_branch_stack(event))
        return -EOPNOTSUPP;

    switch (event_id) {
    case PERF_COUNT_SW_CPU_CLOCK:
    case PERF_COUNT_SW_TASK_CLOCK:
        return -ENOENT;

    default:
        break;
    }

    if (event_id >= PERF_COUNT_SW_MAX)
        return -ENOENT;

    if (!event->parent) {
        int err;

        /* (1) Make sure the per-cpu hash table has been created */
        err = swevent_hlist_get(event);
        if (err)
            return err;

        /* (2) Enable the static key (per-event_id switch) for this event type;
            while no such event exists the branch in perf_sw_event() is patched
            out, so the probe points cost almost nothing */
        static_key_slow_inc(&perf_swevent_enabled[event_id]);
        event->destroy = sw_perf_event_destroy;
    }

    return 0;
}

↓

static int swevent_hlist_get(struct perf_event *event)
{
    int err;
    int cpu, failed_cpu;

    get_online_cpus();
    for_each_possible_cpu(cpu) {
        err = swevent_hlist_get_cpu(event, cpu);
        if (err) {
            failed_cpu = cpu;
            goto fail;
        }
    }
    put_online_cpus();

    return 0;
fail:
    for_each_possible_cpu(cpu) {
        if (cpu == failed_cpu)
            break;
        swevent_hlist_put_cpu(event, cpu);
    }

    put_online_cpus();
    return err;
}

↓

static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
    struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
    int err = 0;

    mutex_lock(&swhash->hlist_mutex);

    /* (1.1) Make sure the per-cpu hash table has been allocated */
    if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
        struct swevent_hlist *hlist;

        hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
        if (!hlist) {
            err = -ENOMEM;
            goto exit;
        }
        rcu_assign_pointer(swhash->swevent_hlist, hlist);
    }

    /* (1.2) Take a reference on the hash table */
    swhash->hlist_refcount++;
exit:
    mutex_unlock(&swhash->hlist_mutex);

    return err;
}

The core here is a hash table with 256 buckets by default; events are later linked into it using a hash key derived from (type + config).

#define SWEVENT_HLIST_BITS      8
#define SWEVENT_HLIST_SIZE      (1 << SWEVENT_HLIST_BITS)

struct swevent_hlist {
    struct hlist_head       heads[SWEVENT_HLIST_SIZE];
    struct rcu_head         rcu_head;
};
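For reference, the hash key is computed roughly as follows (paraphrased from swevent_hash() in kernel/events/core.c; details may vary by kernel version): the (type, config) pair is folded into one 64-bit value and hashed down to SWEVENT_HLIST_BITS bits to select a bucket.

static u64 swevent_hash(u64 type, u32 event_id)
{
    u64 val = (u64)event_id | (type << 32);

    return hash_64(val, SWEVENT_HLIST_BITS);
}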

2.2 "cpu-clock"

static struct pmu perf_cpu_clock = {
    .task_ctx_nr    = perf_sw_context,

    .capabilities   = PERF_PMU_CAP_NO_NMI,

    .event_init = cpu_clock_event_init,
    .add        = cpu_clock_event_add,
    .del        = cpu_clock_event_del,
    .start      = cpu_clock_event_start,
    .stop       = cpu_clock_event_stop,
    .read       = cpu_clock_event_read,

    .events_across_hotplug = 1,
};

The corresponding init function:

static int cpu_clock_event_init(struct perf_event *event)
{
    if (event->attr.type != PERF_TYPE_SOFTWARE)
        return -ENOENT;

    if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
        return -ENOENT;

    /*
     * no branch sampling for software events
     */
    if (has_branch_stack(event))
        return -EOPNOTSUPP;

    perf_swevent_init_hrtimer(event);

    return 0;
}

↓

static void perf_swevent_init_hrtimer(struct perf_event *event)
{
    struct hw_perf_event *hwc = &event->hw;

    if (!is_sampling_event(event))
        return;

    /* (1) Initialize the associated hrtimer */
    hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    hwc->hrtimer.function = perf_swevent_hrtimer;

    /*
     * Since hrtimers have a fixed rate, we can do a static freq->period
     * mapping and avoid the whole period adjust feedback stuff.
     */
    /* (2) In freq mode, derive the period from the frequency.
        In this mode ->period_left is non-negative and equals ->sample_period.
     */
    if (event->attr.freq) {
        long freq = event->attr.sample_freq;

        event->attr.sample_period = NSEC_PER_SEC / freq;
        hwc->sample_period = event->attr.sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);
        hwc->last_period = hwc->sample_period;
        event->attr.freq = 0;
    }
}
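For example, with attr.freq = 1 and sample_freq = 4000, the computed period is NSEC_PER_SEC / 4000 = 250000 ns, i.e. one sample every 250 µs; because the hrtimer rate is fixed, this static freq-to-period mapping replaces the usual dynamic period adjustment.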

2.3 "task-clock"

static struct pmu perf_task_clock = {
    .task_ctx_nr    = perf_sw_context,

    .capabilities   = PERF_PMU_CAP_NO_NMI,

    .event_init = task_clock_event_init,
    .add        = task_clock_event_add,
    .del        = task_clock_event_del,
    .start      = task_clock_event_start,
    .stop       = task_clock_event_stop,
    .read       = task_clock_event_read,

    .events_across_hotplug = 1,
};

The corresponding init function:

static int task_clock_event_init(struct perf_event *event)
{
    if (event->attr.type != PERF_TYPE_SOFTWARE)
        return -ENOENT;

    if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
        return -ENOENT;

    /*
     * no branch sampling for software events
     */
    if (has_branch_stack(event))
        return -EOPNOTSUPP;

    perf_swevent_init_hrtimer(event);

    return 0;
}

3. Event add/del

As explained in the previous chapter, "the perf_event kernel framework", a task-scoped perf_event must be scheduled in and out together with its task; what the scheduler callbacks ultimately control is starting and stopping the perf_event.

  • start call path: context_switch() -> finish_task_switch() -> perf_event_task_sched_in() -> __perf_event_task_sched_in() -> perf_event_context_sched_in() -> perf_event_sched_in() -> ctx_sched_in() -> ctx_pinned_sched_in()/ctx_flexible_sched_in() -> group_sched_in() -> event_sched_in() -> pmu->add(event, PERF_EF_START) -> xxx_add():

  • stop call path: context_switch() -> prepare_task_switch() -> perf_event_task_sched_out() -> __perf_event_task_sched_out() -> perf_event_context_sched_out() -> ctx_sched_out() -> group_sched_out() -> event_sched_out() -> pmu->del() -> xxx_del():

3.1 "software"


In this scheme, starting and stopping a "software" event simply means adding it to, or removing it from, the hash table.

  • perf_swevent_add():
static int perf_swevent_add(struct perf_event *event, int flags)
{
    struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
    struct hw_perf_event *hwc = &event->hw;
    struct hlist_head *head;

    /* (1) Recompute the period parameters */
    if (is_sampling_event(event)) {
        hwc->last_period = hwc->sample_period;
        perf_swevent_set_period(event);
    }

    hwc->state = !(flags & PERF_EF_START);

    /* (2) Compute the hash bucket (list head) from the event's (type + config) */
    head = find_swevent_head(swhash, event);
    if (WARN_ON_ONCE(!head))
        return -EINVAL;

    /* (3) Add the event to the bucket's list */
    hlist_add_head_rcu(&event->hlist_entry, head);
    perf_event_update_userpage(event);

    return 0;
}
  • perf_swevent_del():
static void perf_swevent_del(struct perf_event *event, int flags)
{
    /* (1) Remove the event from the list */
    hlist_del_rcu(&event->hlist_entry);
}

3.2 "cpu-clock"

Each event has its own hrtimer, so no hash list is involved here.

  • cpu_clock_event_add():
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
    if (flags & PERF_EF_START)
        cpu_clock_event_start(event, flags);
    perf_event_update_userpage(event);

    return 0;
}

↓

static void cpu_clock_event_start(struct perf_event *event, int flags)
{
    /* (1) Record the current cpu time */
    local64_set(&event->hw.prev_count, local_clock());

    /* (2) Start the hrtimer */
    perf_swevent_start_hrtimer(event);
}

↓

static void perf_swevent_start_hrtimer(struct perf_event *event)
{
    struct hw_perf_event *hwc = &event->hw;
    s64 period;

    if (!is_sampling_event(event))
        return;

    /* (2.1) The hrtimer period is the larger of 10000 ns (10 µs) and ->sample_period */
    period = local64_read(&hwc->period_left);
    if (period) {
        if (period < 0)
            period = 10000;

        local64_set(&hwc->period_left, 0);
    } else {
        period = max_t(u64, 10000, hwc->sample_period);
    }
    hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
              HRTIMER_MODE_REL_PINNED);
}
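Note that if attr.sample_period is 0, is_sampling_event() is false and no hrtimer is started at all; such an event only accumulates a count via the update functions shown below.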

When the hrtimer expires, perf_swevent_hrtimer() runs; it reports sample data periodically.

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
    enum hrtimer_restart ret = HRTIMER_RESTART;
    struct perf_sample_data data;
    struct pt_regs *regs;
    struct perf_event *event;
    u64 period;

    event = container_of(hrtimer, struct perf_event, hw.hrtimer);

    if (event->state != PERF_EVENT_STATE_ACTIVE)
        return HRTIMER_NORESTART;

    event->pmu->read(event);

    perf_sample_data_init(&data, 0, event->hw.last_period);
    regs = get_irq_regs();

    if (regs && !perf_exclude_event(event, regs)) {
        if (!(event->attr.exclude_idle && is_idle_task(current)))
            if (__perf_event_overflow(event, 1, &data, regs))
                ret = HRTIMER_NORESTART;
    }

    period = max_t(u64, 10000, event->hw.sample_period);
    hrtimer_forward_now(hrtimer, ns_to_ktime(period));

    return ret;
}
  • cpu_clock_event_del():
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
    cpu_clock_event_stop(event, flags);
}

↓

static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
    /* (1) Stop the hrtimer */
    perf_swevent_cancel_hrtimer(event);

    /* (2) Compute the cpu time delta and use it to update the event's count */
    cpu_clock_event_update(event);
}

↓

static void cpu_clock_event_update(struct perf_event *event)
{
    s64 prev;
    u64 now;

    now = local_clock();
    prev = local64_xchg(&event->hw.prev_count, now);

    /* (2.1) Add the delta to the count */
    local64_add(now - prev, &event->count);
}
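As a simple worked example: if local_clock() read 1000000 ns when the event was started and reads 1250000 ns at del() time, prev_count is swapped to the new value and the event's count grows by 250000 ns of cpu time.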

3.3 "task-clock"

"task-clock" is essentially the same as "cpu-clock", except that the count it accumulates is the delta of the context time of the event's context.

  • task_clock_event_add():
static int task_clock_event_add(struct perf_event *event, int flags)
{
    if (flags & PERF_EF_START)
        task_clock_event_start(event, flags);
    perf_event_update_userpage(event);

    return 0;
}

↓

static void task_clock_event_start(struct perf_event *event, int flags)
{
    local64_set(&event->hw.prev_count, event->ctx->time);
    perf_swevent_start_hrtimer(event);
}
  • task_clock_event_del():
static void task_clock_event_del(struct perf_event *event, int flags)
{
    task_clock_event_stop(event, PERF_EF_UPDATE);
}

↓

static void task_clock_event_stop(struct perf_event *event, int flags)
{
    perf_swevent_cancel_hrtimer(event);
    task_clock_event_update(event, event->ctx->time);
}

↓

static void task_clock_event_update(struct perf_event *event, u64 now)
{
    u64 prev;
    s64 delta;

    prev = local64_xchg(&event->hw.prev_count, now);
    delta = now - prev;
    local64_add(delta, &event->count);
}

4. Event data collection

4.1 "software"

After a probe point fires, the count is accumulated and sample data is reported. Take "context-switches" as an example:

context_switch() -> prepare_task_switch() -> perf_event_task_sched_out():

static inline void perf_event_task_sched_out(struct task_struct *prev,
                         struct task_struct *next)
{
    /* (1) Probe point for "context-switches": report data when it is hit */
    perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

    if (static_key_false(&perf_sched_events.key))
        __perf_event_task_sched_out(prev, next);
}

↓

static __always_inline void
perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
    /* (1.1) The event's static key was enabled at init time; without any such
        event this branch is patched out and the probe is nearly free */
    if (static_key_false(&perf_swevent_enabled[event_id])) {
        struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

        perf_fetch_caller_regs(regs);
        ___perf_sw_event(event_id, nr, regs, addr);
    }
}

↓

void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
    struct perf_sample_data data;

    if (WARN_ON_ONCE(!regs))
        return;

    perf_sample_data_init(&data, addr, 0);
    do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
}

↓

static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                    u64 nr,
                    struct perf_sample_data *data,
                    struct pt_regs *regs)
{
    struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
    struct perf_event *event;
    struct hlist_head *head;

    rcu_read_lock();

    /* (1.1.1) Find this pmu's event list in the hash table */
    head = find_swevent_head_rcu(swhash, type, event_id);
    if (!head)
        goto end;

    /* (1.1.2) Deliver the data to each event in turn */
    hlist_for_each_entry_rcu(event, head, hlist_entry) {

        /* (1.1.3) The hash key (type + config) can collide, so match the event precisely */
        if (perf_swevent_match(event, type, event_id, data, regs))
            perf_swevent_event(event, nr, data, regs);
    }
end:
    rcu_read_unlock();
}

↓

static void perf_swevent_event(struct perf_event *event, u64 nr,
                   struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;

    /* (1.1.2.1) Update the count */
    local64_add(nr, &event->count);

    if (!regs)
        return;

    if (!is_sampling_event(event))
        return;

    /* (1.1.2.2) Report the sample data; see the "tracepoint events" chapter for details */
    if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
        data->period = nr;
        return perf_swevent_overflow(event, 1, data, regs);
    } else
        data->period = event->hw.last_period;

    if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
        return perf_swevent_overflow(event, 1, data, regs);

    if (local64_add_negative(nr, &hwc->period_left))
        return;

    perf_swevent_overflow(event, 0, data, regs);
}
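As a worked example of the period logic (paraphrasing code not excerpted here): with sample_period = 100, hwc->period_left is held below zero and each probe hit adds nr to it; perf_swevent_overflow() is only entered once the value crosses zero, so roughly one sample is emitted per 100 hits.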

4.2 "cpu-clock"

  • On del(), compute the count delta:

cpu_clock_event_del() -> cpu_clock_event_stop() -> cpu_clock_event_update():

static void cpu_clock_event_update(struct perf_event *event)
{
    s64 prev;
    u64 now;

    now = local_clock();
    prev = local64_xchg(&event->hw.prev_count, now);

    /* Add the delta to the count */
    local64_add(now - prev, &event->count);
}
  • In the hrtimer, report sample data periodically:
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
    enum hrtimer_restart ret = HRTIMER_RESTART;
    struct perf_sample_data data;
    struct pt_regs *regs;
    struct perf_event *event;
    u64 period;

    event = container_of(hrtimer, struct perf_event, hw.hrtimer);

    if (event->state != PERF_EVENT_STATE_ACTIVE)
        return HRTIMER_NORESTART;

    event->pmu->read(event);

    perf_sample_data_init(&data, 0, event->hw.last_period);
    regs = get_irq_regs();

    if (regs && !perf_exclude_event(event, regs)) {
        if (!(event->attr.exclude_idle && is_idle_task(current)))

            /* Report the sample */
            if (__perf_event_overflow(event, 1, &data, regs))
                ret = HRTIMER_NORESTART;
    }

    period = max_t(u64, 10000, event->hw.sample_period);
    hrtimer_forward_now(hrtimer, ns_to_ktime(period));

    return ret;
}

4.3 "task-clock"

On del(), compute the count delta:

task_clock_event_del() -> task_clock_event_stop() -> task_clock_event_update():

static void task_clock_event_update(struct perf_event *event, u64 now)
{
    u64 prev;
    s64 delta;

    prev = local64_xchg(&event->hw.prev_count, now);
    delta = now - prev;

    /* Add the delta to the count */
    local64_add(delta, &event->count);
}

In the hrtimer, report sample data periodically:

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
    enum hrtimer_restart ret = HRTIMER_RESTART;
    struct perf_sample_data data;
    struct pt_regs *regs;
    struct perf_event *event;
    u64 period;

    event = container_of(hrtimer, struct perf_event, hw.hrtimer);

    if (event->state != PERF_EVENT_STATE_ACTIVE)
        return HRTIMER_NORESTART;

    event->pmu->read(event);

    perf_sample_data_init(&data, 0, event->hw.last_period);
    regs = get_irq_regs();

    if (regs && !perf_exclude_event(event, regs)) {
        if (!(event->attr.exclude_idle && is_idle_task(current)))

            /* Report the sample */
            if (__perf_event_overflow(event, 1, &data, regs))
                ret = HRTIMER_NORESTART;
    }

    period = max_t(u64, 10000, event->hw.sample_period);
    hrtimer_forward_now(hrtimer, ns_to_ktime(period));

    return ret;
}
2016-11-01 14:51:17 longwang155069

Preface

A new mechanism usually appears in order to solve some concrete problem, and the wakeup events framework is no exception. Let's first look at the background that led to it, and then at how it is implemented internally.
Power management in stock Linux is essentially a full ("cold") suspend, while Android treats Linux suspend as its ordinary standby state, so the stock Linux scheme did not fit Android. Android's answer was to change Linux until it fit: early on it introduced the "wakelocks" mechanism, which modified the native suspend/resume flow and inserted Android-specific handling, and for a while this solved Android's power-saving needs. It left one problem, though: synchronization between suspend and wakeup events. While the system is suspending it freezes processes, prepares and suspends devices, disables irqs, and so on; if a wakeup event arrives in this window, the system may fail to wake from the suspend in progress. Linux therefore introduced the wakeup events framework in 2.6.36 to synchronize suspend with wakeup events. In Android 4.4 the old "wakelocks" implementation was removed, and Android reimplemented wakelocks on top of the wakeup events framework while keeping the upper-level API unchanged.

For details, see http://lwn.net/Articles/388131/ or https://lwn.net/Articles/416690/

Data structures

The wakeup events framework is implemented in /kernel/drivers/base/power/wakeup.c. Its central data structure is wakeup_source, which literally represents a device that generates wakeup events.

/**
 * struct wakeup_source - Representation of wakeup sources
 *
 * @total_time: Total time this wakeup source has been active.
 * @max_time: Maximum time this wakeup source has been continuously active.
 * @last_time: Monotonic clock when the wakeup source's was touched last time.
 * @prevent_sleep_time: Total time this source has been preventing autosleep.
 * @event_count: Number of signaled wakeup events.
 * @active_count: Number of times the wakeup source was activated.
 * @relax_count: Number of times the wakeup source was deactivated.
 * @expire_count: Number of times the wakeup source's timeout has expired.
 * @wakeup_count: Number of times the wakeup source might abort suspend.
 * @active: Status of the wakeup source.
 * @has_timeout: The wakeup source has been activated with a timeout.
 */
struct wakeup_source {
	const char 		*name;
	struct list_head	entry;
	spinlock_t		lock;
	struct timer_list	timer;
	unsigned long		timer_expires;
	ktime_t total_time;
	ktime_t max_time;
	ktime_t last_time;
	ktime_t start_prevent_time;
	ktime_t prevent_sleep_time;
	unsigned long		event_count;
	unsigned long		active_count;
	unsigned long		relax_count;
	unsigned long		expire_count;
	unsigned long		wakeup_count;
	bool			active:1;
	bool			autosleep_enabled:1;
};
.name: name of the wakeup source.
.entry: links the wakeup source into the global list, for management.
.lock: spinlock protecting access to the source.
.timer: timer implementing the source's timeout.
.timer_expires: expiry time of that timer.
.total_time: total time this wakeup source has been active.
.max_time: longest time this wakeup source has been continuously active.
.last_time: time of this wakeup source's last state change.
.start_prevent_time: when this source started preventing autosleep.
.prevent_sleep_time: total time this source has been preventing autosleep.
.event_count: number of wakeup events signaled by this source.
.active_count: number of times this source was activated.
.relax_count: number of times this source was deactivated.
.expire_count: number of times this source's timeout expired.
.wakeup_count: number of times this source may have aborted a suspend.
.active: current state of this wakeup source.
.autosleep_enabled: whether autosleep is enabled.

So what exactly is a wakeup source? In Linux, only a device capable of waking up the system is called a "wakeup source". Since only devices can wake the system, struct device must carry some flag indicating whether a given device has that capability.
struct device {
    ...
	struct dev_pm_info	power; 
	struct dev_pm_domain	*pm_domain;
	...
}
Here dev_pm_info carries the device's PM-related details.
struct dev_pm_info {
	pm_message_t		power_state;
	unsigned int		can_wakeup:1;
	unsigned int		async_suspend:1;
	
    ...
#ifdef CONFIG_PM_SLEEP
	struct list_head	entry;
	struct completion	completion;
	struct wakeup_source	*wakeup;
	bool			wakeup_path:1;
	bool			syscore:1;
#else
	unsigned int		should_wakeup:1;
#endif
	...
}
The can_wakeup bit indicates whether the device can wake up the system. Only a wakeup-capable device gets the power-related directory under /sys/devices/xxx/.
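For a wakeup-capable device the capability can also be inspected and toggled from user space through that directory (the device path below is illustrative):

# cat /sys/devices/.../power/wakeup
enabled
# echo disabled > /sys/devices/.../power/wakeup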

Sys interface

To make it easy to inspect the system's wakeup sources, Linux creates a "wakeup_sources" file under /sys/kernel/debug that records detailed information about every wakeup source.
static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
{
	struct wakeup_source *ws;

	seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
		"expire_count\tactive_since\ttotal_time\tmax_time\t"
		"last_change\tprevent_suspend_time\n");

	rcu_read_lock();
	list_for_each_entry_rcu(ws, &wakeup_sources, entry)
		print_wakeup_source_stats(m, ws);
	rcu_read_unlock();

	return 0;
}

static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, wakeup_sources_stats_show, NULL);
}

static const struct file_operations wakeup_sources_stats_fops = {
	.owner = THIS_MODULE,
	.open = wakeup_sources_stats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __init wakeup_sources_debugfs_init(void)
{
	wakeup_sources_stats_dentry = debugfs_create_file("wakeup_sources",
			S_IRUGO, NULL, NULL, &wakeup_sources_stats_fops);
	return 0;
}
Here is the wakeup source information from a phone:
root@test:/ # cat /sys/kernel/debug/wakeup_sources                    
name		active_count	event_count	wakeup_count	expire_count	active_since	total_time	max_time	last_change	prevent_suspend_time
event1      	40644		40644		0		0		0		31294		30		537054822		0
event4      	4496		4496		0		0		0		13369		22		20913677		0
event5      	4496		4496		0		0		0		13048		22		20913677		0
event0      	4540		4540		0		0		0		27995		277		258270184		0
eventpoll   	40688		54176		0		0		0		217		5		537054822		0
NETLINK     	2175		2175		0		0		0		16960		59		537058523		0
event_count: the number of wakeup events this wakeup source has reported.
active_count: when a wakeup source generates wakeup events it becomes active, but not every event needs to activate it again; if it is already active, no new activation occurs. To some extent this reflects how busy the underlying device is.
wakeup_count: if a wakeup source generates wakeup events while the system is suspending, the suspend is aborted; this field records how many times that happened.

Related APIs

  • pm_stay_awake (called to notify the PM core when wakeup events have been generated)
void pm_stay_awake(struct device *dev)
{
	unsigned long flags;

	if (!dev)
		return;

	spin_lock_irqsave(&dev->power.lock, flags);
	__pm_stay_awake(dev->power.wakeup);
	spin_unlock_irqrestore(&dev->power.lock, flags);
}
This function simply calls __pm_stay_awake(); it can be used in interrupt context.
void __pm_stay_awake(struct wakeup_source *ws)
{
	unsigned long flags;

	if (!ws)
		return;

	spin_lock_irqsave(&ws->lock, flags);

	wakeup_source_report_event(ws);
	del_timer(&ws->timer);
	ws->timer_expires = 0;

	spin_unlock_irqrestore(&ws->lock, flags);
}
When a wakeup source generates wakeup events, pm_stay_awake() is called to report them; pm_relax() will be called afterwards to tell the PM core the events have been fully handled, which is why __pm_stay_awake() needs no timer (it deletes any pending one). wakeup_source_report_event() then reports the wakeup event.
static void wakeup_source_report_event(struct wakeup_source *ws)
{
	ws->event_count++;
	/* This is racy, but the counter is approximate anyway. */
	if (events_check_enabled)
		ws->wakeup_count++;

	if (!ws->active)
		wakeup_source_activate(ws);
}
1. Increment the wakeup event count, i.e. event_count.
2. If events_check_enabled is set, an in-flight suspend/hibernate will be aborted, so wakeup_count is incremented to record the number of aborted suspends.
/*
 * If set, the suspend/hibernate code will abort transitions to a sleep state
 * if wakeup events are registered during or immediately before the transition.
 */
bool events_check_enabled __read_mostly;
3. If the wakeup source is not yet active, activate it. If it was already active, event_count simply grows larger than active_count.
static void wakeup_source_activate(struct wakeup_source *ws)
{
	unsigned int cec;

	/*
	 * active wakeup source should bring the system
	 * out of PM_SUSPEND_FREEZE state
	 */
	freeze_wake();

	ws->active = true;
	ws->active_count++;
	ws->last_time = ktime_get();
	if (ws->autosleep_enabled)
		ws->start_prevent_time = ws->last_time;

	/* Increment the counter of events in progress. */
	cec = atomic_inc_return(&combined_event_count);

	trace_wakeup_source_activate(ws->name, cec);
}
1. freeze_wake() brings the system out of the PM_SUSPEND_FREEZE state.
2. The wakeup source is marked active.
3. The source's active_count is incremented.
4. The source's last_time is set.
5. If autosleep is enabled, start_prevent_time is recorded, since the source starts preventing autosleep from this point on.
6. The "wakeup events in progress" counter is incremented; while it is non-zero, wakeup events are still being handled and the system must not suspend.
  • pm_relax (called to notify the PM core once the wakeup event has been handled)
void pm_relax(struct device *dev)
{
	unsigned long flags;

	if (!dev)
		return;

	spin_lock_irqsave(&dev->power.lock, flags);
	__pm_relax(dev->power.wakeup);
	spin_unlock_irqrestore(&dev->power.lock, flags);
}
This function likewise just calls __pm_relax().
void __pm_relax(struct wakeup_source *ws)
{
	unsigned long flags;

	if (!ws)
		return;

	spin_lock_irqsave(&ws->lock, flags);
	if (ws->active)
		wakeup_source_deactivate(ws);
	spin_unlock_irqrestore(&ws->lock, flags);
}
If the wakeup source is currently active, wakeup_source_deactivate() is called to deactivate it.
static void wakeup_source_deactivate(struct wakeup_source *ws)
{
	unsigned int cnt, inpr, cec;
	ktime_t duration;
	ktime_t now;

	ws->relax_count++;
	/*
	 * __pm_relax() may be called directly or from a timer function.
	 * If it is called directly right after the timer function has been
	 * started, but before the timer function calls __pm_relax(), it is
	 * possible that __pm_stay_awake() will be called in the meantime and
	 * will set ws->active.  Then, ws->active may be cleared immediately
	 * by the __pm_relax() called from the timer function, but in such a
	 * case ws->relax_count will be different from ws->active_count.
	 */
	if (ws->relax_count != ws->active_count) {
		ws->relax_count--;
		return;
	}

	ws->active = false;

	now = ktime_get();
	duration = ktime_sub(now, ws->last_time);
	ws->total_time = ktime_add(ws->total_time, duration);
	if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time))
		ws->max_time = duration;

	ws->last_time = now;
	del_timer(&ws->timer);
	ws->timer_expires = 0;

	if (ws->autosleep_enabled)
		update_prevent_sleep_time(ws, now);

	/*
	 * Increment the counter of registered wakeup events and decrement the
	 * couter of wakeup events in progress simultaneously.
	 */
	cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count);
	trace_wakeup_source_deactivate(ws->name, cec);

	split_counters(&cnt, &inpr);
	if (!inpr && waitqueue_active(&wakeup_count_wait_queue))
		wake_up(&wakeup_count_wait_queue);
}
1. Increment the deactivation count, i.e. relax_count.
2. Set the source's active flag to false.
3. Compute how long the wakeup event took to handle, then update total_time, last_time and max_time.
4. If autosleep is enabled, update prevent_sleep_time:
static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now)
{
	ktime_t delta = ktime_sub(now, ws->start_prevent_time);
	ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta);
}
5. Increment "registered wakeup events" and decrement "wakeup events in progress" in a single atomic operation (both counters are packed into combined_event_count, so adding MAX_IN_PROGRESS amounts to +1 on the registered half and -1 on the in-progress half).
6. The wakeup_count handling is left for the wakeup count section; in short, this is the inverse of activation.
  • device_init_wakeup (initializes the wakeup source; device drivers normally use this interface)
int device_init_wakeup(struct device *dev, bool enable)
{
	int ret = 0;

	if (!dev)
		return -EINVAL;

	if (enable) {
		device_set_wakeup_capable(dev, true);
		ret = device_wakeup_enable(dev);
	} else {
		if (dev->power.can_wakeup)
			device_wakeup_disable(dev);

		device_set_wakeup_capable(dev, false);
	}

	return ret;
}
1. If enable is true, set the device's wakeup capability flag, then enable the wakeup source.
2. If enable is false, disable the wakeup source and also clear the device's wakeup capability flag.
  • device_set_wakeup_capable (marks whether the device can wake the system from sleep)
void device_set_wakeup_capable(struct device *dev, bool capable)
{
	if (!!dev->power.can_wakeup == !!capable)
		return;

	if (device_is_registered(dev) && !list_empty(&dev->power.entry)) {
		if (capable) {
			if (wakeup_sysfs_add(dev))
				return;
		} else {
			wakeup_sysfs_remove(dev);
		}
	}
	dev->power.can_wakeup = capable;
}
If capable is true, set power.can_wakeup and add the wakeup attributes to sysfs; if capable is false, remove the wakeup attributes from sysfs, then clear can_wakeup. The wakeup attributes are defined as follows:
static struct attribute *wakeup_attrs[] = {
#ifdef CONFIG_PM_SLEEP
	&dev_attr_wakeup.attr,
	&dev_attr_wakeup_count.attr,
	&dev_attr_wakeup_active_count.attr,
	&dev_attr_wakeup_abort_count.attr,
	&dev_attr_wakeup_expire_count.attr,
	&dev_attr_wakeup_active.attr,
	&dev_attr_wakeup_total_time_ms.attr,
	&dev_attr_wakeup_max_time_ms.attr,
	&dev_attr_wakeup_last_time_ms.attr,
#ifdef CONFIG_PM_AUTOSLEEP
	&dev_attr_wakeup_prevent_sleep_time_ms.attr,
#endif
#endif
	NULL,
};
static struct attribute_group pm_wakeup_attr_group = {
	.name	= power_group_name,
	.attrs	= wakeup_attrs,
};
Reading and writing these wakeup attributes is not detailed here; we have seen the pattern many times already.
  • device_wakeup_enable(enable device to be a wakeup source)
int device_wakeup_enable(struct device *dev)
{
	struct wakeup_source *ws;
	int ret;

	if (!dev || !dev->power.can_wakeup)
		return -EINVAL;

	ws = wakeup_source_register(dev_name(dev));
	if (!ws)
		return -ENOMEM;

	ret = device_wakeup_attach(dev, ws);
	if (ret)
		wakeup_source_unregister(ws);

	return ret;
}
1. If the device does not exist or cannot wake the system, return -EINVAL.
2. Create and register a wakeup source.
3. Attach the wakeup source to the device; on failure, unregister and free it.
  • wakeup_source_register (allocates a wakeup source and adds it to the wakeup source list)
struct wakeup_source *wakeup_source_register(const char *name)
{
	struct wakeup_source *ws;

	ws = wakeup_source_create(name);
	if (ws)
		wakeup_source_add(ws);

	return ws;
}
1. Allocate a wakeup_source structure and set its name field.
struct wakeup_source *wakeup_source_create(const char *name)
{
	struct wakeup_source *ws;

	ws = kmalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return NULL;

	wakeup_source_prepare(ws, name ? kstrdup(name, GFP_KERNEL) : NULL);
	return ws;
}
2. Add the given wakeup source to the wakeup_sources list.
void wakeup_source_add(struct wakeup_source *ws)
{
	unsigned long flags;

	if (WARN_ON(!ws))
		return;

	spin_lock_init(&ws->lock);
	setup_timer(&ws->timer, pm_wakeup_timer_fn, (unsigned long)ws);
	ws->active = false;
	ws->last_time = ktime_get();

	spin_lock_irqsave(&events_lock, flags);
	list_add_rcu(&ws->entry, &wakeup_sources);
	spin_unlock_irqrestore(&events_lock, flags);
}
Besides linking the source into the wakeup_sources list, this initializes the spinlock and sets up the timer.
In addition, if the timer expires, the timer callback deactivates the wakeup source and increments its expire count:
static void pm_wakeup_timer_fn(unsigned long data)
{
	struct wakeup_source *ws = (struct wakeup_source *)data;
	unsigned long flags;

	spin_lock_irqsave(&ws->lock, flags);

	if (ws->active && ws->timer_expires
	    && time_after_eq(jiffies, ws->timer_expires)) {
		wakeup_source_deactivate(ws);
		ws->expire_count++;
	}

	spin_unlock_irqrestore(&ws->lock, flags);
}
  • device_wakeup_attach (attaches the wakeup source to the device)
static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws)
{
	spin_lock_irq(&dev->power.lock);
	if (dev->power.wakeup) {
		spin_unlock_irq(&dev->power.lock);
		return -EEXIST;
	}
	dev->power.wakeup = ws;
	spin_unlock_irq(&dev->power.lock);
	return 0;
}
If the device already has a wakeup source, return -EEXIST; otherwise install the one passed in.
  • pm_wakeup_event (activates the wakeup source, then deactivates it after a delay)
void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec)
{
	unsigned long flags;
	unsigned long expires;

	if (!ws)
		return;

	spin_lock_irqsave(&ws->lock, flags);

	wakeup_source_report_event(ws);

	if (!msec) {
		wakeup_source_deactivate(ws);
		goto unlock;
	}

	expires = jiffies + msecs_to_jiffies(msec);
	if (!expires)
		expires = 1;

	if (!ws->timer_expires || time_after(expires, ws->timer_expires)) {
		mod_timer(&ws->timer, expires);
		ws->timer_expires = expires;
	}

 unlock:
	spin_unlock_irqrestore(&ws->lock, flags);
}
The wakeup source is activated first; if the timeout is zero it is deactivated again immediately, otherwise it is deactivated after the given delay.
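For instance, a driver that only needs to hold the system awake briefly can call pm_wakeup_event(dev, 200), the struct device wrapper around __pm_wakeup_event(), to report an event and keep the system awake for up to 200 ms without a matching pm_relax().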

Example analysis

Now that the wakeup events framework is understood, how should a driver actually use it? The best way to find out is to look for answers in the kernel sources.

1. A device that wants to act as a wakeup source must call the framework's interface functions, and device_init_wakeup() is exactly such an exported entry point. Search the kernel for its callers and you will find plenty; following one of them is a good way to explore. (kernel/drivers/input/keyboard/gpio-keys.c)

The probe function sets up a workqueue, a timer, and the wakeup source:
INIT_WORK(&bdata->work, gpio_keys_gpio_work_func);
setup_timer(&bdata->timer,gpio_keys_gpio_timer, (unsigned long)bdata);
device_init_wakeup(&pdev->dev, wakeup);

2. When a key is pressed, the key's interrupt handler is invoked:
static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id)
{
    ...

	if (bdata->button->wakeup)
		pm_stay_awake(bdata->input->dev.parent);
    ...
    
	return IRQ_HANDLED;
}
pm_stay_awake() notifies the PM core that wakeup events are pending and the system must not suspend.

3. The timer callback kicks the workqueue, which processes the key event and then releases the wakeup event:
static void gpio_keys_gpio_work_func(struct work_struct *work)
{
    gpio_keys_gpio_report_event(bdata);

	if (bdata->button->wakeup)
		pm_relax(bdata->input->dev.parent);

     ...
}
Other drivers follow the same pattern; only the concrete call sites differ. A consolidated sketch follows below.
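Putting the pieces together, the typical driver pattern looks roughly like this (a minimal sketch with a hypothetical foo_* driver, not taken from the kernel):

#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/pm_wakeup.h>
#include <linux/workqueue.h>

struct foo_data {
	struct device *dev;
	struct work_struct work;
};

static void foo_work_fn(struct work_struct *work)
{
	struct foo_data *fd = container_of(work, struct foo_data, work);

	/* ... process the wakeup event ... */
	pm_relax(fd->dev);            /* done: stop blocking suspend */
}

static irqreturn_t foo_irq(int irq, void *data)
{
	struct foo_data *fd = data;

	pm_stay_awake(fd->dev);       /* wakeup event in flight: block suspend */
	schedule_work(&fd->work);
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	struct foo_data *fd;

	fd = devm_kzalloc(&pdev->dev, sizeof(*fd), GFP_KERNEL);
	if (!fd)
		return -ENOMEM;
	fd->dev = &pdev->dev;
	INIT_WORK(&fd->work, foo_work_fn);

	/* mark the device wakeup-capable and register its wakeup source */
	device_init_wakeup(&pdev->dev, true);
	return 0;
}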


2018-07-26 15:44:34 pwl999

perf reuses all of the trace_event probe points in ftrace (i.e. the tracepoints); the trace data can either be delivered through the ftrace channel or be wrapped as a perf_event for the perf tool.

You can view the tracepoint events with the perf list command; they match the trace_events under /sys/kernel/debug/tracing/events/:

# simpleperf list tracepoint | more
List of tracepoint events:
  almk:almk_shrink
  almk:almk_vmpressure
  asoc:snd_soc_bias_level_done
  asoc:snd_soc_bias_level_start
  asoc:snd_soc_dapm_connected
  asoc:snd_soc_dapm_done
  asoc:snd_soc_dapm_path
  asoc:snd_soc_dapm_start
  asoc:snd_soc_dapm_walk_done
  asoc:snd_soc_dapm_widget_event_done
  asoc:snd_soc_dapm_widget_event_start
  asoc:snd_soc_dapm_widget_power
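As a quick illustration (a minimal sketch, not from the original article), a tracepoint event can be opened from user space by feeding the tracepoint id into perf_event_open(2); this typically requires root or a relaxed perf_event_paranoid setting:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    struct perf_event_attr attr;
    long id = -1;

    /* the id file under events/<subsys>/<event>/ holds the attr.config value */
    FILE *f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
    if (!f || fscanf(f, "%ld", &id) != 1) {
        perror("tracepoint id");
        return 1;
    }
    fclose(f);

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_TRACEPOINT;
    attr.size = sizeof(attr);
    attr.config = id;
    attr.sample_period = 1;      /* sample on every hit */

    int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0) {
        perror("perf_event_open");
        return 1;
    }
    close(fd);
    return 0;
}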

1. How it works

When a trace_event is defined, the tracepoint, the trace_event callback and the perf_event callback are all defined together; the existing machinery merely has to register the perf_event callback on the tracepoint.

include/trace/trace_event.h:

/* (1) Definition of the trace_event_class */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
_TRACE_PERF_PROTO(call, PARAMS(proto));                 \
static char print_fmt_##call[] = print;                 \
static struct trace_event_class __used __refdata event_class_##call = { \
    .system         = TRACE_SYSTEM_STRING,          \
    .define_fields      = trace_event_define_fields_##call, \
    .fields         = LIST_HEAD_INIT(event_class_##call.fields),\
    .raw_init       = trace_event_raw_init,         \

    /* (1.1) The trace_event callback */
    .probe          = trace_event_raw_event_##call,     \

    /* (1.2) Registration function: hooks the callbacks onto the tracepoint */
    .reg            = trace_event_reg,          \

    /* (1.3) The perf_event part of the definition */
    _TRACE_PERF_INIT(call)                      \
};

/* (2) Definition of the trace_event_call */
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args)           \
                                    \
static struct trace_event_call __used event_##call = {          \

    /* (2.1) Reference to the trace_event_class */
    .class          = &event_class_##template,      \

    /* (2.2) The tracepoint definition */
    {                               \
        .tp         = &__tracepoint_##call,     \
    },                              \
    .event.funcs        = &trace_event_type_funcs_##template,   \
    .print_fmt      = print_fmt_##template,         \
    .flags          = TRACE_EVENT_FL_TRACEPOINT,        \
};                                  \
static struct trace_event_call __used                   \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call

Expansion of the _TRACE_PERF_INIT() macro:

#ifdef CONFIG_PERF_EVENTS

#define _TRACE_PERF_PROTO(call, proto)                  \
    static notrace void                     \
    perf_trace_##call(void *__data, proto);

#define _TRACE_PERF_INIT(call)                      \

    /* (1.3) The perf_event callback: perf_trace_##call() */
    .perf_probe     = perf_trace_##call,

#else
#define _TRACE_PERF_PROTO(call, proto)
#define _TRACE_PERF_INIT(call)
#endif /* CONFIG_PERF_EVENTS */

Following the perf_event callback further, in include/trace/perf.h:

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                         \
perf_trace_##call(void *__data, proto)                  \
{                                   \
    struct trace_event_call *event_call = __data;           \
    struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
    struct trace_event_raw_##call *entry;               \
    struct pt_regs *__regs;                     \
    u64 __addr = 0, __count = 1;                    \
    struct task_struct *__task = NULL;              \
    struct hlist_head *head;                    \
    int __entry_size;                       \
    int __data_size;                        \
    int rctx;                           \
                                    \
    __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                    \
    head = this_cpu_ptr(event_call->perf_events);           \
    if (__builtin_constant_p(!__task) && !__task &&         \
                hlist_empty(head))          \
        return;                         \
                                    \
    /* (1) Compute the size of the trace data to be stored */
    __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
                 sizeof(u64));              \
    __entry_size -= sizeof(u32);                    \
                                    \

    /* (2) Allocate a temporary buffer for the data */
    entry = perf_trace_buf_prepare(__entry_size,            \
            event_call->event.type, &__regs, &rctx);    \
    if (!entry)                         \
        return;                         \
                                    \

    /* (3) Fetch the current registers */
    perf_fetch_caller_regs(__regs);                 \
                                    \

    /* (4) Fill in the placeholders for dynamic fields: offset + size */
    tstruct                             \
                                    \

    /* (5) Record the trace data into the tmp buffer */
    { assign; }                         \
                                    \

    /* (6) Submit the trace data to every perf_event attached to this tracepoint, copying the tmp buffer into each perf_event's ring buffer */
    perf_trace_buf_submit(entry, __entry_size, rctx, __addr,    \
        __count, __regs, head, __task);             \
}

Next, let's walk through the process and the data format in detail.

2. Event init

The tracepoint pmu is defined as follows:

static struct pmu perf_tracepoint = {
    .task_ctx_nr    = perf_sw_context,

    .event_init = perf_tp_event_init,
    .add        = perf_trace_add,
    .del        = perf_trace_del,
    .start      = perf_swevent_start,
    .stop       = perf_swevent_stop,
    .read       = perf_swevent_read,

    .events_across_hotplug = 1,
};

When the perf_event_open() syscall creates a new perf_event, it ends up calling pmu->event_init().

perf_event_open() -> perf_event_alloc() -> perf_init_event() -> perf_try_init_event() -> pmu->event_init(event) -> perf_tp_event_init():

static int perf_tp_event_init(struct perf_event *event)
{
    int err;

    /* (1) wrong attr.type: bail out with an error */
    if (event->attr.type != PERF_TYPE_TRACEPOINT)
        return -ENOENT;

    /*
     * no branch sampling for tracepoint events
     */
    /* (2) branch sampling is not supported in tracepoint sample data */
    if (has_branch_stack(event))
        return -EOPNOTSUPP;

    /* (3) continue with init */
    err = perf_trace_init(event);
    if (err)
        return err;

    event->destroy = tp_perf_event_destroy;

    return 0;
}

↓

int perf_trace_init(struct perf_event *p_event)
{
    struct trace_event_call *tp_event;
    u64 event_id = p_event->attr.config;
    int ret = -EINVAL;

    mutex_lock(&event_mutex);

    /* (3.1) walk the trace_event_call list and find the one whose type equals attr.config */
    list_for_each_entry(tp_event, &ftrace_events, list) {
        if (tp_event->event.type == event_id &&
            tp_event->class && tp_event->class->reg &&
            try_module_get(tp_event->mod)) {

            /* (3.2) continue initialization */
            ret = perf_trace_event_init(tp_event, p_event);
            if (ret)
                module_put(tp_event->mod);
            break;
        }
    }
    mutex_unlock(&event_mutex);

    return ret;
}

↓

static int perf_trace_event_init(struct trace_event_call *tp_event,
                 struct perf_event *p_event)
{
    int ret;

    /* (3.2.1) permission check */
    ret = perf_trace_event_perm(tp_event, p_event);
    if (ret)
        return ret;

    /* (3.2.2) register operation */
    ret = perf_trace_event_reg(tp_event, p_event);
    if (ret)
        return ret;

    /* (3.2.3) open operation; a no-op for tracepoints */
    ret = perf_trace_event_open(p_event);
    if (ret) {
        perf_trace_event_unreg(p_event);
        return ret;
    }

    return 0;
}

↓

static int perf_trace_event_reg(struct trace_event_call *tp_event,
                struct perf_event *p_event)
{
    struct hlist_head __percpu *list;
    int ret = -ENOMEM;
    int cpu;

    p_event->tp_event = tp_event;
    if (tp_event->perf_refcount++ > 0)
        return 0;

    list = alloc_percpu(struct hlist_head);
    if (!list)
        goto fail;

    /* (3.2.2.1) initialize the per-cpu perf_event list on the trace_event_call: tp_event->perf_events.
        A perf_event that wants this tracepoint's data must be added to the list of the cpu it is bound to.
     */
    for_each_possible_cpu(cpu)
        INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

    tp_event->perf_events = list;

    /* (3.2.2.2) allocate the tmp buffer space; the buffers are per-cpu, one on every cpu.
        There are 4 contexts: task, softirq, hardirq, nmi.
     */
    if (!total_ref_count) {
        char __percpu *buf;
        int i;

        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            buf = (char __percpu *)alloc_percpu(perf_trace_t);
            if (!buf)
                goto fail;

            perf_trace_buf[i] = buf;
        }
    }

    /* (3.2.2.3) register via class->reg(TRACE_REG_PERF_REGISTER) */
    ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
    if (ret)
        goto fail;

    total_ref_count++;
    return 0;

fail:
    if (!total_ref_count) {
        int i;

        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            free_percpu(perf_trace_buf[i]);
            perf_trace_buf[i] = NULL;
        }
    }

    if (!--tp_event->perf_refcount) {
        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;
    }

    return ret;
}

tp_event->class->reg() was already fixed to trace_event_reg() when the trace_event_class was defined:

int trace_event_reg(struct trace_event_call *call,
            enum trace_reg type, void *data)
{
    struct trace_event_file *file = data;

    WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
    switch (type) {

    /* (1) register the trace_event callback on the tracepoint */
    case TRACE_REG_REGISTER:
        return tracepoint_probe_register(call->tp,
                         call->class->probe,
                         file);
    case TRACE_REG_UNREGISTER:
        tracepoint_probe_unregister(call->tp,
                        call->class->probe,
                        file);
        return 0;

#ifdef CONFIG_PERF_EVENTS

    /* (2) register the perf_event callback on the tracepoint */
    case TRACE_REG_PERF_REGISTER:
        return tracepoint_probe_register(call->tp,
                         call->class->perf_probe,
                         call);
    case TRACE_REG_PERF_UNREGISTER:
        tracepoint_probe_unregister(call->tp,
                        call->class->perf_probe,
                        call);
        return 0;
    case TRACE_REG_PERF_OPEN:
    case TRACE_REG_PERF_CLOSE:
    case TRACE_REG_PERF_ADD:
    case TRACE_REG_PERF_DEL:
        return 0;
#endif
    }
    return 0;
}

After the registration above, the perf_event callback is hooked into the tracepoint. From this point on, whenever the tracepoint is hit the perf_event callback is invoked, but the trace data still has no receiver: no perf_event has been added to the tp_event->perf_events list yet.
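
To make this init path concrete, here is a minimal userspace sketch (not from the original post; PERF_TYPE_TRACEPOINT, attr.config and the tracefs id file are standard perf ABI, the rest is illustrative) that opens a perf_event on the sched/sched_switch tracepoint, which is exactly what drives the perf_tp_event_init() path above:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
{
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
    struct perf_event_attr attr;
    FILE *f;
    int id, fd;

    /* for PERF_TYPE_TRACEPOINT, attr.config is the tracepoint event id
     * exported by tracefs; perf_trace_init() matches it against
     * tp_event->event.type */
    f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
    if (!f || fscanf(f, "%d", &id) != 1)
        return 1;
    fclose(f);

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_TRACEPOINT;   /* checked by perf_tp_event_init() */
    attr.size = sizeof(attr);
    attr.config = id;
    attr.sample_period = 1;             /* sample every hit */
    attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME;

    fd = sys_perf_event_open(&attr, 0, -1, -1, 0); /* this task, any cpu */
    if (fd < 0)
        return 1;
    /* ... mmap the ring buffer and consume PERF_RECORD_SAMPLE records ... */
    close(fd);
    return 0;
}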

3、event add/del


Following on from the registration in the previous step, the perf_event callback can now be invoked by the tracepoint, but the perf_event still receives no data: it is in the enabled/inactive state, and only an active event receives data.

One tracepoint can feed several perf_events, and a perf_event only receives data once it is on the tp_event->perf_events per-cpu list. Starting and stopping a tracepoint perf_event therefore boils down to manipulating the tp_event->perf_events per-cpu list.

As explained in the earlier chapter on the perf_event kernel framework, a task-scoped perf_event is scheduled together with its task; the scheduler callbacks ultimately start and stop the event.

context_switch() -> finish_task_switch() -> perf_event_task_sched_in() -> __perf_event_task_sched_in() -> perf_event_context_sched_in() -> perf_event_sched_in() -> ctx_sched_in() -> ctx_pinned_sched_in()/ctx_flexible_sched_in() -> group_sched_in() -> event_sched_in() -> pmu->add(event, PERF_EF_START) -> perf_trace_add():

int perf_trace_add(struct perf_event *p_event, int flags)
{
    struct trace_event_call *tp_event = p_event->tp_event;
    struct hlist_head __percpu *pcpu_list;
    struct hlist_head *list;

    pcpu_list = tp_event->perf_events;
    if (WARN_ON_ONCE(!pcpu_list))
        return -EINVAL;

    if (!(flags & PERF_EF_START))
        p_event->hw.state = PERF_HES_STOPPED;

    /* (1) add the perf_event to the current cpu's tp_event->perf_events list */
    list = this_cpu_ptr(pcpu_list);
    hlist_add_head_rcu(&p_event->hlist_entry, list);

    /* (2) no-op */
    return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

context_switch() -> prepare_task_switch() -> perf_event_task_sched_out() -> __perf_event_task_sched_out() -> perf_event_context_sched_out() -> ctx_sched_out() -> group_sched_out() -> event_sched_out() -> pmu->del() -> perf_trace_del():

void perf_trace_del(struct perf_event *p_event, int flags)
{
    struct trace_event_call *tp_event = p_event->tp_event;

    /* (1) remove the perf_event from the tp_event->perf_events list */
    if (!hlist_unhashed(&p_event->hlist_entry))
        hlist_del_rcu(&p_event->hlist_entry);

    /* (2) no-op */
    tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

4、event data collection

The main job of a perf_event is to provide count-type or sample-type trace data.

The tracepoint pmu delivers its trace data passively: when the tracepoint is hit and the perf callback runs, it adds to the perf_event's count, and, if the perf_event has a ringbuffer allocated, it also records sample data there.

4.1、count data

When defining a trace_event we can use the __perf_count() macro to specify the value added to count on each tracepoint hit. For example:

DECLARE_EVENT_CLASS(sched_stat_runtime,

    TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),

    /* (1) on each hit, add runtime to count */
    TP_ARGS(tsk, __perf_count(runtime), vruntime),

    TP_STRUCT__entry(
        __array( char,  comm,   TASK_COMM_LEN   )
        __field( pid_t, pid         )
        __field( u64,   runtime         )
        __field( u64,   vruntime            )
    ),

    TP_fast_assign(
        memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
        __entry->pid        = tsk->pid;
        __entry->runtime    = runtime;
        __entry->vruntime   = vruntime;
    ),

    TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
            __entry->comm, __entry->pid,
            (unsigned long long)__entry->runtime,
            (unsigned long long)__entry->vruntime)
);

If __perf_count() is not used, count is simply incremented by 1 on each tracepoint hit.
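
As an aside, the accumulated count (the local64_add() in perf_swevent_event() further below) can be read back from userspace with a plain read(); a minimal sketch, assuming a tracepoint perf_event fd opened as in the earlier example with read_format left at 0 (the helper name is made up):

#include <stdint.h>
#include <unistd.h>

static int read_count(int fd, uint64_t *count)
{
    /* with read_format == 0 the kernel returns a single u64 counter value */
    return read(fd, count, sizeof(*count)) == sizeof(*count) ? 0 : -1;
}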

Let's look at the actual code:

/* (1.1) the __perf_addr() macro specifies the PERF_SAMPLE_ADDR value for sampling */
#undef __perf_addr
#define __perf_addr(a)  (__addr = (a))

/* (1.2) the __perf_count() macro specifies the increment added to count */
#undef __perf_count
#define __perf_count(c) (__count = (c))

/* (1.3) the __perf_task() macro directs the trace data to the perf_events bound to a specific task */
#undef __perf_task
#define __perf_task(t)  (__task = (t))

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                         \
perf_trace_##call(void *__data, proto)                  \
{                                   \
    struct trace_event_call *event_call = __data;           \
    struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
    struct trace_event_raw_##call *entry;               \
    struct pt_regs *__regs;                     \

    /* (1.4) defaults when the macros are not used:
        __addr = 0
        __count = 1
        __task = NULL
     */
    u64 __addr = 0, __count = 1;                    \
    struct task_struct *__task = NULL;              \
    struct hlist_head *head;                    \
    int __entry_size;                       \
    int __data_size;                        \
    int rctx;                           \
                                    \
    /* (2.1) get the length of the dynamic members */
    __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                    \
    /* (3) get this cpu's perf_event list for the tracepoint; every perf_event that wants this cpu's tracepoint data is linked here */
    head = this_cpu_ptr(event_call->perf_events);           \
    if (__builtin_constant_p(!__task) && !__task &&         \
                hlist_empty(head))          \
        return;                         \
                                    \

    /* (2.2) compute the total raw data length to store: dynamic length + fixed members */
    __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
                 sizeof(u64));              \
    __entry_size -= sizeof(u32);                    \
                                    \
    /* (4) get a tmp buffer of that size,
        and determine the current context: task, softirq, hardirq, nmi.
     */
    entry = perf_trace_buf_prepare(__entry_size,            \
            event_call->event.type, &__regs, &rctx);    \
    if (!entry)                         \
        return;                         \
                                    \

    /* (5) get the current registers */
    perf_fetch_caller_regs(__regs);                 \
                                    \

    /* (6) fill in the placeholders for dynamic members: offset + size */
    tstruct                             \
                                    \

    /* (7) record the trace data into the tmp buffer */
    { assign; }                         \
                                    \

    /* (8) submit the raw data to every perf_event attached to this tracepoint, copying the tmp buffer into each perf_event's ringbuffer.
        Note: this is only the raw data; the full perf sample record contains other fields as well.
     */
    perf_trace_buf_submit(entry, __entry_size, rctx, __addr,    \
        __count, __regs, head, __task);             \
}

perf_trace_buf_prepare() obtains a tmp buffer; to avoid contention it picks the buffer out of a per-cpu variable indexed by the current context (task, softirq, hardirq, nmi) and the current cpu:

void *perf_trace_buf_prepare(int size, unsigned short type,
                 struct pt_regs **regs, int *rctxp)
{
    struct trace_entry *entry;
    unsigned long flags;
    char *raw_data;
    int pc;

    BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

    if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
            "perf buffer not large enough"))
        return NULL;

    pc = preempt_count();

    /* (4.1) get the current context, and guard against recursive entry */
    *rctxp = perf_swevent_get_recursion_context();
    if (*rctxp < 0)
        return NULL;

    /* (4.2) based on the context, get the tmp buffer that holds the regs */
    if (regs)
        *regs = this_cpu_ptr(&__perf_regs[*rctxp]);

    /* (4.3) based on the context, get the tmp buffer that holds the trace data */
    raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

    /* zero the dead bytes from align to not leak stack to user */
    memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

    entry = (struct trace_entry *)raw_data;
    local_save_flags(flags);

    /* (4.4) fill in the common fields (pid, flags, preempt count) of the tmp trace entry */
    tracing_generic_entry_update(entry, flags, pc);
    entry->type = type;

    return raw_data;
}

↓

int perf_swevent_get_recursion_context(void)
{
    struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);

    return get_recursion_context(swhash->recursion);
}

↓

static inline int get_recursion_context(int *recursion)
{
    int rctx;

    if (in_nmi())
        rctx = 3;
    else if (in_irq())
        rctx = 2;
    else if (in_softirq())
        rctx = 1;
    else
        rctx = 0;

    if (recursion[rctx])
        return -1;

    recursion[rctx]++;
    barrier();

    return rctx;
}

perf_trace_buf_submit() delivers the data to the perf_events waiting on the this_cpu_ptr(event_call->perf_events) list:

static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
               u64 count, struct pt_regs *regs, void *head,
               struct task_struct *task)
{
    perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
}

↓

void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
           struct pt_regs *regs, struct hlist_head *head, int rctx,
           struct task_struct *task)
{
    struct perf_sample_data data;
    struct perf_event *event;

    /* (6.1) holds the raw data */
    struct perf_raw_record raw = {
        .size = entry_size,
        .data = record,
    };

    /* (6.2) initialize the sample data */
    perf_sample_data_init(&data, addr, 0);
    data.raw = &raw;

    /* (6.3) deliver the sample data to each perf_event linked on the this_cpu_ptr(event_call->perf_events) list */
    hlist_for_each_entry_rcu(event, head, hlist_entry) {
        if (perf_tp_event_match(event, &data, regs))
            perf_swevent_event(event, count, &data, regs);
    }

    /*
     * If we got specified a target task, also iterate its context and
     * deliver this event there too.
     */
    /* (6.4) the __perf_task() macro: deliver the sample data to the perf_events bound to the specified task */
    if (task && task != current) {
        struct perf_event_context *ctx;
        struct trace_entry *entry = record;

        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
        if (!ctx)
            goto unlock;

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
            if (event->attr.type != PERF_TYPE_TRACEPOINT)
                continue;
            if (event->attr.config != entry->type)
                continue;
            if (perf_tp_event_match(event, &data, regs))
                perf_swevent_event(event, count, &data, regs);
        }
unlock:
        rcu_read_unlock();
    }

    perf_swevent_put_recursion_context(rctx);
}

↓

static void perf_swevent_event(struct perf_event *event, u64 nr,
                   struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;

    /* (6.3.1) key point:
        the perf_event count is accumulated here
     */
    local64_add(nr, &event->count);

    if (!regs)
        return;

    /* (6.3.2) a perf_event with no sampling configured needs no sample data; return */
    if (!is_sampling_event(event))
        return;

    if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
        data->period = nr;
        return perf_swevent_overflow(event, 1, data, regs);
    } else
        data->period = event->hw.last_period;

    if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
        return perf_swevent_overflow(event, 1, data, regs);

    if (local64_add_negative(nr, &hwc->period_left))
        return;

    perf_swevent_overflow(event, 0, data, regs);
}

4.2、sample data

Let's continue with how sample data is reported. There are two modes: period and freq.

  • period mode: a sample is reported once per period. Three related fields (see the attr sketch right after them):

    event->hw.last_period   // the period value currently in effect
    event->hw.period_left   // how much of the current period remains; a negative value in the range (-last_period, 0)
    event->hw.sample_period // the newest period value; last_period = sample_period only takes effect once the previous period has run out
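
From the userspace side the two modes are selected through perf_event_attr; a hedged illustration (the field names are standard perf ABI, the values and helper names are arbitrary):

#include <linux/perf_event.h>
#include <string.h>

/* period mode: report one sample every 1000 occurrences */
void attr_period_mode(struct perf_event_attr *attr)
{
    memset(attr, 0, sizeof(*attr));
    attr->freq = 0;
    attr->sample_period = 1000;   /* becomes hw.sample_period / last_period */
}

/* freq mode: let the kernel recompute the period (perf_adjust_period())
 * so that roughly 100 samples/second are reported */
void attr_freq_mode(struct perf_event_attr *attr)
{
    memset(attr, 0, sizeof(*attr));
    attr->freq = 1;
    attr->sample_freq = 100;
}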

static void perf_swevent_event(struct perf_event *event, u64 nr,
                   struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;

    /* (1) key point:
        the perf_event count is accumulated here
     */
    local64_add(nr, &event->count);

    if (!regs)
        return;

    /* (2) a perf_event with no sampling configured needs no sample data; return */
    if (!is_sampling_event(event))
        return;

    /* (3.1) if PERF_SAMPLE_PERIOD is selected and we are in period mode:
        report the sample together with the current data->period, ignoring the period limit;
        perf_swevent_overflow() is called with overflow = 1, meaning the period limit is bypassed
     */
    if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
        data->period = nr;
        return perf_swevent_overflow(event, 1, data, regs);
    } else
        data->period = event->hw.last_period;

    /* (3.2) if hwc->sample_period == 1 and we are in period mode:
        every hit is reported and the period needs no re-arming, so overflow = 1
     */
    if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
        return perf_swevent_overflow(event, 1, data, regs);

    /* (4.1) check whether the period has expired; if not, drop the data and return */
    if (local64_add_negative(nr, &hwc->period_left))
        return;

    /* (4.2) the period expired: report the data */
    perf_swevent_overflow(event, 0, data, regs);
}

↓

static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                    struct perf_sample_data *data,
                    struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;
    int throttle = 0;

    /* (4.2.1) overflow == 0 means we got here because the period expired, so it must be re-armed;
        perf_swevent_set_period() returns how many periods have elapsed since the last report
     */
    if (!overflow)
        overflow = perf_swevent_set_period(event);

    if (hwc->interrupts == MAX_INTERRUPTS)
        return;

    /* (4.2.2) overflow is the number of elapsed periods, or 1 when forced;
        one report per period
     */
    for (; overflow; overflow--) {
        if (__perf_event_overflow(event, throttle,
                        data, regs)) {
            /*
             * We inhibit the overflow from happening when
             * hwc->interrupts == MAX_INTERRUPTS.
             */
            break;
        }
        throttle = 1;
    }
}

↓

static int __perf_event_overflow(struct perf_event *event,
                   int throttle, struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    int events = atomic_read(&event->event_limit);
    struct hw_perf_event *hwc = &event->hw;
    u64 seq;
    int ret = 0;

    /*
     * Non-sampling counters might still use the PMI to fold short
     * hardware counters, ignore those.
     */
    /* (4.2.2.1) nothing to report for non-sampling events */
    if (unlikely(!is_sampling_event(event)))
        return 0;

    /* (4.2.2.2) throttle check (not fully understood yet?) */
    seq = __this_cpu_read(perf_throttled_seq);
    if (seq != hwc->interrupts_seq) {
        hwc->interrupts_seq = seq;
        hwc->interrupts = 1;
    } else {
        hwc->interrupts++;
        if (unlikely(throttle
                 && hwc->interrupts >= max_samples_per_tick)) {
            __this_cpu_inc(perf_throttled_count);
            hwc->interrupts = MAX_INTERRUPTS;
            perf_log_throttle(event, 0);
            tick_nohz_full_kick();
            ret = 1;
        }
    }

    /* (4.2.2.4) in freq mode, recompute the period from the observed event rate */
    if (event->attr.freq) {
        u64 now = perf_clock();
        s64 delta = now - hwc->freq_time_stamp;

        hwc->freq_time_stamp = now;

        if (delta > 0 && delta < 2*TICK_NSEC)
            perf_adjust_period(event, delta, hwc->last_period, true);
    }

    /*
     * XXX event_limit might not quite work as expected on inherited
     * events
     */

    /* (4.2.2.5) perf_event pending handling (not fully understood yet?) */
    event->pending_kill = POLL_IN;
    if (events && atomic_dec_and_test(&event->event_limit)) {
        ret = 1;
        event->pending_kill = POLL_HUP;
        event->pending_disable = 1;
        irq_work_queue(&event->pending);
    }

    /* (4.2.2.6) this is where the sample data actually gets reported:
        call the event's own overflow_handler(), or the default perf_event_output()
     */
    if (event->overflow_handler)
        event->overflow_handler(event, data, regs);
    else
        perf_event_output(event, data, regs);

    if (*perf_event_fasync(event) && event->pending_kill) {
        event->pending_wakeup = 1;
        irq_work_queue(&event->pending);
    }

    return ret;
}

perf_event_output() is the default function for reporting sample data:

void perf_event_output(struct perf_event *event,
            struct perf_sample_data *data,
            struct pt_regs *regs)
{
    struct perf_output_handle handle;
    struct perf_event_header header;

    /* protect the callchain buffers */
    rcu_read_lock();

    /* (1) compute the total sample size, and capture extra data according to attr.sample_type */
    perf_prepare_sample(&header, data, event, regs);

    /* (2) reserve space of that size in the event's ringbuffer */
    if (perf_output_begin(&handle, event, header.size))
        goto exit;

    /* (3) write the data into the ringbuffer */
    perf_output_sample(&handle, &header, data, event);

    /* (4) publish rb->head to rb->user_page->data_head */
    perf_output_end(&handle);

exit:
    rcu_read_unlock();
}

|→

int perf_output_begin(struct perf_output_handle *handle,
              struct perf_event *event, unsigned int size)
{
    struct ring_buffer *rb;
    unsigned long tail, offset, head;
    int have_lost, page_shift;
    struct {
        struct perf_event_header header;
        u64          id;
        u64          lost;
    } lost_event;

    rcu_read_lock();
    /*
     * For inherited events we send all the output towards the parent.
     */
    /* (2.1) inherited child events all write to the parent's ringbuffer */
    if (event->parent)
        event = event->parent;

    rb = rcu_dereference(event->rb);
    if (unlikely(!rb))
        goto out;

    if (unlikely(!rb->nr_pages))
        goto out;

    handle->rb    = rb;
    handle->event = event;

    have_lost = local_read(&rb->lost);
    if (unlikely(have_lost)) {
        size += sizeof(lost_event);
        if (event->attr.sample_id_all)
            size += event->id_header_size;
    }

    perf_output_get_handle(handle);

    /* (2.2) find a chunk of space in the ringbuffer */
    do {
        /* (2.2.1) the tail pointer comes from user_page, the head pointer from the rb struct;
            the head update is flushed from the rb struct into user_page only after the data write is confirmed
         */
        tail = READ_ONCE(rb->user_page->data_tail);
        offset = head = local_read(&rb->head);
        if (!rb->overwrite &&
            unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
            goto fail;

        /*
         * The above forms a control dependency barrier separating the
         * @tail load above from the data stores below. Since the @tail
         * load is required to compute the branch to fail below.
         *
         * A, matches D; the full memory barrier userspace SHOULD issue
         * after reading the data and before storing the new tail
         * position.
         *
         * See perf_output_put_handle().
         */

        head += size;
    } while (local_cmpxchg(&rb->head, offset, head) != offset);

    /*
     * We rely on the implied barrier() by local_cmpxchg() to ensure
     * none of the data stores below can be lifted up by the compiler.
     */

    if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
        local_add(rb->watermark, &rb->wakeup);

    page_shift = PAGE_SHIFT + page_order(rb);

    /* (2.3) compute the page, offset and size of the first chunk of the reserved buffer */
    handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
    offset &= (1UL << page_shift) - 1;
    handle->addr = rb->data_pages[handle->page] + offset;
    handle->size = (1UL << page_shift) - offset;

    if (unlikely(have_lost)) {
        struct perf_sample_data sample_data;

        lost_event.header.size = sizeof(lost_event);
        lost_event.header.type = PERF_RECORD_LOST;
        lost_event.header.misc = 0;
        lost_event.id          = event->id;
        lost_event.lost        = local_xchg(&rb->lost, 0);

        perf_event_header__init_id(&lost_event.header,
                       &sample_data, event);
        perf_output_put(handle, lost_event);
        perf_event__output_id_sample(event, handle, &sample_data);
    }

    return 0;

fail:
    local_inc(&rb->lost);
    perf_output_put_handle(handle);
out:
    rcu_read_unlock();

    return -ENOSPC;
}

|→

void perf_output_end(struct perf_output_handle *handle)
{
    perf_output_put_handle(handle);
    rcu_read_unlock();
}

↓

static void perf_output_put_handle(struct perf_output_handle *handle)
{
    struct ring_buffer *rb = handle->rb;
    unsigned long head;

again:
    head = local_read(&rb->head);

    /*
     * IRQ/NMI can happen here, which means we can miss a head update.
     */

    if (!local_dec_and_test(&rb->nest))
        goto out;

    /*
     * Since the mmap() consumer (userspace) can run on a different CPU:
     *
     *   kernel             user
     *
     *   if (LOAD ->data_tail) {        LOAD ->data_head
     *          (A)     smp_rmb()   (C)
     *      STORE $data                 LOAD $data
     *  smp_wmb()   (B)     smp_mb()    (D)
     *  STORE ->data_head       STORE ->data_tail
     *   }
     *
     * Where A pairs with D, and B pairs with C.
     *
     * In our case (A) is a control dependency that separates the load of
     * the ->data_tail and the stores of $data. In case ->data_tail
     * indicates there is no room in the buffer to store $data we do not.
     *
     * D needs to be a full barrier since it separates the data READ
     * from the tail WRITE.
     *
     * For B a WMB is sufficient since it separates two WRITEs, and for C
     * an RMB is sufficient since it separates two READs.
     *
     * See perf_output_begin().
     */
    smp_wmb(); /* B, matches C */
    rb->user_page->data_head = head;

    /*
     * Now check if we missed an update -- rely on previous implied
     * compiler barriers to force a re-read.
     */
    if (unlikely(head != local_read(&rb->head))) {
        local_inc(&rb->nest);
        goto again;
    }

    if (handle->wakeup != local_read(&rb->wakeup))
        perf_output_wakeup(handle);

out:
    preempt_enable();
}
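
On the user side, the C/D half of the barrier pairing in the comment above looks roughly like the sketch below (the mmap layout of a user_page followed by 2^n data pages is standard perf ABI; the acquire/release atomics stand in for the smp_rmb()/smp_mb() of the comment, and the helper names are made up):

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>

/* 'up' is the mmap'd first page, 'data' the power-of-2-sized data area */
void drain(struct perf_event_mmap_page *up, uint8_t *data, uint64_t size,
           void (*handle)(struct perf_event_header *))
{
    uint64_t head = __atomic_load_n(&up->data_head, __ATOMIC_ACQUIRE); /* C */
    uint64_t tail = up->data_tail;
    uint8_t rec[64 * 1024];   /* header.size is u16, so one record fits */

    while (tail < head) {
        uint64_t off = tail & (size - 1);
        struct perf_event_header *hdr = (void *)(data + off);
        uint16_t sz = hdr->size;  /* records are 8-byte aligned, so the
                                     header itself never wraps */

        if (off + sz <= size) {
            memcpy(rec, data + off, sz);              /* contiguous */
        } else {                                      /* wraps at the end */
            memcpy(rec, data + off, size - off);
            memcpy(rec + (size - off), data, sz - (size - off));
        }
        handle((struct perf_event_header *)rec);
        tail += sz;
    }
    __atomic_store_n(&up->data_tail, tail, __ATOMIC_RELEASE); /* D */
}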


enum perf_event_type describes in detail how the sample data is laid out for each record type:

enum perf_event_type {

    /*
     * If perf_event_attr.sample_id_all is set then all event types will
     * have the sample_type selected fields related to where/when
     * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU,
     * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed
     * just after the perf_event_header and the fields already present for
     * the existing fields, i.e. at the end of the payload. That way a newer
     * perf.data file will be supported by older perf tools, with these new
     * optional fields being ignored.
     *
     * struct sample_id {
     *  { u32           pid, tid; } && PERF_SAMPLE_TID
     *  { u64           time;     } && PERF_SAMPLE_TIME
     *  { u64           id;       } && PERF_SAMPLE_ID
     *  { u64           stream_id;} && PERF_SAMPLE_STREAM_ID
     *  { u32           cpu, res; } && PERF_SAMPLE_CPU
     *  { u64           id;   } && PERF_SAMPLE_IDENTIFIER
     * } && perf_event_attr::sample_id_all
     *
     * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.  The
     * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed
     * relative to header.size.
     */

    /*
     * The MMAP events record the PROT_EXEC mappings so that we can
     * correlate userspace IPs to code. They have the following structure:
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u32             pid, tid;
     *  u64             addr;
     *  u64             len;
     *  u64             pgoff;
     *  char                filename[];
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_MMAP            = 1,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u64             id;
     *  u64             lost;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_LOST            = 2,

    /*
     * struct {
     *  struct perf_event_header    header;
     *
     *  u32             pid, tid;
     *  char                comm[];
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_COMM            = 3,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid, ppid;
     *  u32             tid, ptid;
     *  u64             time;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_EXIT            = 4,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u64             time;
     *  u64             id;
     *  u64             stream_id;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_THROTTLE            = 5,
    PERF_RECORD_UNTHROTTLE          = 6,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid, ppid;
     *  u32             tid, ptid;
     *  u64             time;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_FORK            = 7,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid, tid;
     *
     *  struct read_format      values;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_READ            = 8,

    /*
     * struct {
     *  struct perf_event_header    header;
     *
     *  #
     *  # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
     *  # The advantage of PERF_SAMPLE_IDENTIFIER is that its position
     *  # is fixed relative to header.
     *  #
     *
     *  { u64           id;   } && PERF_SAMPLE_IDENTIFIER
     *  { u64           ip;   } && PERF_SAMPLE_IP
     *  { u32           pid, tid; } && PERF_SAMPLE_TID
     *  { u64           time;     } && PERF_SAMPLE_TIME
     *  { u64           addr;     } && PERF_SAMPLE_ADDR
     *  { u64           id;   } && PERF_SAMPLE_ID
     *  { u64           stream_id;} && PERF_SAMPLE_STREAM_ID
     *  { u32           cpu, res; } && PERF_SAMPLE_CPU
     *  { u64           period;   } && PERF_SAMPLE_PERIOD
     *
     *  { struct read_format    values;   } && PERF_SAMPLE_READ
     *
     *  { u64           nr,
     *    u64           ips[nr];  } && PERF_SAMPLE_CALLCHAIN
     *
     *  #
     *  # The RAW record below is opaque data wrt the ABI
     *  #
     *  # That is, the ABI doesn't make any promises wrt to
     *  # the stability of its content, it may vary depending
     *  # on event, hardware, kernel version and phase of
     *  # the moon.
     *  #
     *  # In other words, PERF_SAMPLE_RAW contents are not an ABI.
     *  #
     *
     *  { u32           size;
     *    char                  data[size];}&& PERF_SAMPLE_RAW
     *
     *  { u64                   nr;
     *        { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
     *
     *  { u64           abi; # enum perf_sample_regs_abi
     *    u64           regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
     *
     *  { u64           size;
     *    char          data[size];
     *    u64           dyn_size; } && PERF_SAMPLE_STACK_USER
     *
     *  { u64           weight;   } && PERF_SAMPLE_WEIGHT
     *  { u64           data_src; } && PERF_SAMPLE_DATA_SRC
     *  { u64           transaction; } && PERF_SAMPLE_TRANSACTION
     *  { u64           abi; # enum perf_sample_regs_abi
     *    u64           regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
     * };
     */
    PERF_RECORD_SAMPLE          = 9,

    /*
     * The MMAP2 records are an augmented version of MMAP, they add
     * maj, min, ino numbers to be used to uniquely identify each mapping
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u32             pid, tid;
     *  u64             addr;
     *  u64             len;
     *  u64             pgoff;
     *  u32             maj;
     *  u32             min;
     *  u64             ino;
     *  u64             ino_generation;
     *  u32             prot, flags;
     *  char                filename[];
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_MMAP2           = 10,

    /*
     * Records that new data landed in the AUX buffer part.
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u64             aux_offset;
     *  u64             aux_size;
     *  u64             flags;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_AUX             = 11,

    /*
     * Indicates that instruction trace has started
     *
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid;
     *  u32             tid;
     * };
     */
    PERF_RECORD_ITRACE_START        = 12,

    /*
     * Records the dropped/lost sample number.
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u64             lost;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_LOST_SAMPLES        = 13,

    /*
     * Records a context switch in or out (flagged by
     * PERF_RECORD_MISC_SWITCH_OUT). See also
     * PERF_RECORD_SWITCH_CPU_WIDE.
     *
     * struct {
     *  struct perf_event_header    header;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_SWITCH          = 14,

    /*
     * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
     * next_prev_tid that are the next (switching out) or previous
     * (switching in) pid/tid.
     *
     * struct {
     *  struct perf_event_header    header;
     *  u32             next_prev_pid;
     *  u32             next_prev_tid;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_SWITCH_CPU_WIDE     = 15,

    PERF_RECORD_MAX,            /* non-ABI */
};
2016-01-20 16:14:25 colouful987

Linux signal events Learning note

Reference: a blog post on sigemptyset/sigfillset/sigprocmask.

1. Signal handling

Signal generation: use kill -l to see which signals exist, and man 7 signal for the details.

Signals come from two sources: hardware and software. Hardware means, for example, the keyboard, like the usual key combinations (Ctrl+C, Ctrl+Z, etc.); software means signals raised via system functions or commands.

The system provides many signal-raising functions; the most common are kill, raise, alarm and setitimer (an interval timer).

Declarations of the first three:

#include <sys/types.h>
#include <signal.h>
#include <unistd.h>
int kill(pid_t pid,int sig);
int raise(int sig);
unsigned int alarm(unsigned int seconds);

In kill(), sig is the signal to send to the process (pmst: there are many reasons to kill, so naturally they must be distinguished; my own understanding). pid takes three kinds of values:

  • positive: sig is sent to the process with that pid
  • 0: sig is sent to every process in the same process group as the sender (pmst: an application is more than one process, I suppose)
  • -1: sig is sent to every process the sender has permission to signal, except init (PID 1)

raise() sends sig to the calling process itself; it can be implemented with the kill() function above.
alarm() is time-related: it arranges for a SIGALRM (14) to be delivered after seconds seconds. The same effect can be had with setitimer() plus signal().

A quick test of alarm():

#include <unistd.h>
#include <stdio.h>

int main(void)
{
    unsigned int i;

    alarm(1);                 /* SIGALRM after 1 second */
    for (i = 0; 1; i++)
        printf("I=%d", i);
}

Running this prints a varying number of iterations before dying (pmst: on a 512M CentOS VM it reached 8127).
alarm() is meant to trigger some action after the set time, but the default action of the resulting SIGALRM is to terminate the process.
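
To survive the alarm, install a SIGALRM handler before calling alarm(); a minimal sketch (the handler and flag names are made up, not from the original post):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void on_alarm(int signo)
{
    (void)signo;
    fired = 1;                       /* just set a flag; async-signal-safe */
}

int main(void)
{
    struct sigaction act;
    unsigned int i;

    memset(&act, 0, sizeof(act));
    act.sa_handler = on_alarm;       /* replace the default "terminate" action */
    sigemptyset(&act.sa_mask);
    sigaction(SIGALRM, &act, NULL);

    alarm(1);
    for (i = 0; !fired; i++)
        ;                            /* keep counting until the alarm fires */
    printf("survived the alarm, i=%u\n", i);
    return 0;
}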

2. Signal operations

Sometimes we want a process to keep running unaffected by a signal; for example, we don't want the program above to die after 1 second. That's where signal operations come in, and the most common one is signal blocking, which uses the functions below.

Declarations of the signal-set functions:

#include <signal.h>
int sigemptyset(sigset_t *set);
int sigfillset(sigset_t *set);
int sigaddset(sigset_t *set,int signo);
int sigdelset(sigset_t *set,int signo);
int sigismember(sigset_t *set,int signo);
int sigprocmask(int how,const sigset_t *set,sigset_t *oset);
  • sigemptyset() initializes the signal set set to the empty set (pmst: I take the signals listed by kill -l to be the full signal set; here they are all cleared out)
  • sigfillset() also initializes the set, but to the set of all signals (pmst: here everything is included)
  • sigaddset() adds the signal signo to the set (signo is simply the signal number)
  • sigdelset() removes a signal from the set; the inverse of the above
  • sigismember() tests whether a signal is in the set; presumably short for "sig is member of set"
  • sigprocmask() is the key function. Build the signal set set before calling it. It adds the given set to the process's blocked-signal set; if oset is non-NULL, the current blocked set is saved there. The how parameter selects the operation:
    • SIG_BLOCK: add the set to the process's current blocked set.
    • SIG_UNBLOCK: remove the set from the current blocked set.
    • SIG_SETMASK: make the set the new blocked set.

Note: blocking a signal makes the system hold it for later delivery (not discard it, just delay it). Blocking is usually temporary, to keep a signal from interrupting a sensitive operation.

See also: signal blocking.

Test case:

#include <signal.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    double y;
    sigset_t intmask;
    int i, repeat_factor;

    if (argc != 2) {
        fprintf(stderr, "Usage:%s repeat_factor\n\a", argv[0]);
        exit(1);
    }
    if ((repeat_factor = atoi(argv[1])) < 1)
        repeat_factor = 10;

    sigemptyset(&intmask);          /* start from the empty signal set */
    sigaddset(&intmask, SIGINT);    /* add the Ctrl+C (SIGINT) signal */
    while (1) {
        /* block the set; we don't want the old mask saved, so pass NULL */
        sigprocmask(SIG_BLOCK, &intmask, NULL);
        fprintf(stderr, "SIGINT signal blocked\n");
        for (i = 0; i < repeat_factor; i++)
            y = sin((double)i);
        fprintf(stderr, "Blocked calculation is finished\n");

        /* unblock: a pending SIGINT fires right here */
        sigprocmask(SIG_UNBLOCK, &intmask, NULL);
        fprintf(stderr, "SIGINT signal unblocked\n");
        for (i = 0; i < repeat_factor; i++)
            y = sin((double)i);
        fprintf(stderr, "Unblocked calculation is finished\n");
    }
    exit(0);
}

We first build a custom signal set and add SIGINT to it, so that Ctrl+C would normally interrupt the program. Inside while(1), sigprocmask(SIG_BLOCK, ...) blocks SIGINT and we immediately run the for(i=0;i<repeat_factor;i++) y=sin((double)i); loop; with a large repeat_factor this phase takes a while. During it SIGINT is blocked: pressing Ctrl+C only puts the signal into the pending set, and it does not fire until SIG_UNBLOCK lifts the block, at which point the pending signal takes effect immediately.

Sometimes we want to react to a signal promptly: for example, when the user presses Ctrl+C we want to do something specific, such as telling the user this operation is not allowed and asking them not to retry, rather than saying nothing. For that we need the sigaction() function.

First, the declarations:

#include <signal.h>

/* function */
int sigaction(int signo, const struct sigaction *act,
              struct sigaction *oact);

/* structure */
struct sigaction {
    void (*sa_handler)(int signo);   /* function pointer: the signal handler */
    void (*sa_sigaction)(int signo, siginfo_t *info, void *act); /* alternate handler */
    sigset_t sa_mask;                /* signal set */
    int sa_flags;                    /* flags */
    void (*sa_restore)(void);        /* function pointer, used for restore */
};

The three parameters:

  • signo is simply the signal to handle; any valid signal except two (SIGKILL and SIGSTOP)
  • act holds the information on how to handle the signal; see the structure below
  • oact is even simpler: it receives the previous handling information, purely for saving it; NULL is usually fine

The structure fields:

  • sa_handler is a function pointer to a one-argument function: the function that actually handles the signal
  • sa_sigaction and sa_restore are much like sa_handler with different parameters; they are rarely used, so we skip them
  • sa_flags configures various aspects of the signal handling; 0 is usually fine
  • sa_mask is the set of signals to be masked while the handler runs: every signal added to this set is blocked for the duration

In use, point sa_handler at your signal handler and you're done. sa_handler has two special values: SIG_DFL and SIG_IGN. SIG_DFL selects the default action for the signal, while SIG_IGN ignores the signal.

sigaction test case:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>

#define PROMPT "Do you want to terminate the program?\n"
char *prompt = PROMPT;

void ctrl_c_op(int signo)
{
    write(STDERR_FILENO, prompt, strlen(prompt));
}

int main(void)
{
    struct sigaction act;

    act.sa_handler = ctrl_c_op;
    sigemptyset(&act.sa_mask);
    act.sa_flags = 0;
    if (sigaction(SIGINT, &act, NULL) < 0) {
        fprintf(stderr, "Install Signal Action Error:%s\n\a", strerror(errno));
        exit(1);
    }
    while (1)
        ;
}

Very simple: when SIGINT fires, the handler installed through sigaction runs. But things get tricky if another signal arrives while we are already handling one. That is what sa_mask is for: add the signals you don't want to be disturbed by to the sa_mask set, and they stay blocked while the handler runs, to be delivered once it returns. (The above is pmst's personal understanding.)

Other signal functions

Declarations first:

pause and sigsuspend

#include <unistd.h>
#include <signal.h>
int pause(void);
int sigsuspend(const sigset_t *sigmask);
  • pause() simply suspends the process until any signal arrives.
  • sigsuspend() also suspends the process, but atomically replaces the current blocked-signal set with sigmask for the duration of the call; see the sketch after this list.
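
That atomic swap is what makes the classic race-free wait possible; a minimal sketch (the handler, flag and signal choice are illustrative, not from the original post):

#include <signal.h>
#include <string.h>

static volatile sig_atomic_t got_signal;

static void handler(int signo)
{
    (void)signo;
    got_signal = 1;
}

/* wait for SIGUSR1 without the check-then-pause() race */
void wait_for_sigusr1(void)
{
    struct sigaction act;
    sigset_t block, old;

    memset(&act, 0, sizeof(act));
    act.sa_handler = handler;
    sigemptyset(&act.sa_mask);
    sigaction(SIGUSR1, &act, NULL);

    sigemptyset(&block);
    sigaddset(&block, SIGUSR1);
    sigprocmask(SIG_BLOCK, &block, &old);   /* SIGUSR1 cannot slip in here */

    while (!got_signal)
        sigsuspend(&old);   /* atomically unblock SIGUSR1 and sleep */

    sigprocmask(SIG_SETMASK, &old, NULL);   /* restore the original mask */
}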

sigsetjmp and siglongjmp

#include <setjmp.h>
int sigsetjmp(sigjmp_buf env, int savesigs);
void siglongjmp(sigjmp_buf env, int val);

These two jump functions let a program jump from inside a signal handler back to a point we choose, as the example below shows.
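
A hedged example of the pattern (the handler name and messages are made up): save the context and signal mask with sigsetjmp(), and return there from a SIGINT handler with siglongjmp():

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_int(int signo)
{
    (void)signo;
    siglongjmp(env, 1);          /* jump back, restoring the signal mask */
}

int main(void)
{
    struct sigaction act;

    memset(&act, 0, sizeof(act));
    act.sa_handler = on_int;
    sigemptyset(&act.sa_mask);
    sigaction(SIGINT, &act, NULL);

    if (sigsetjmp(env, 1))       /* savesigs=1: save and restore the mask */
        printf("came back from the SIGINT handler\n");

    for (;;)
        pause();                 /* wait for Ctrl+C */
}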

A complete example

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <pwd.h>
#include <sys/types.h>
#include <sys/stat.h>

/* the default personal mailbox on Linux lives under /var/spool/mail/ */
#define MAIL_DIR "/var/spool/mail/"
/* sleep for 10 seconds between checks */
#define SLEEP_TIME 10
#define MAX_FILENAME 255

unsigned char notifyflag = 1;

long get_file_size(const char *filename)
{
    struct stat buf;

    if (stat(filename, &buf) == -1) {
        if (errno == ENOENT)
            return 0;
        else
            return -1;
    }
    return (long)buf.st_size;
}

void send_mail_notify(void)
{
    fprintf(stderr, "New mail has arrived\007\n");
}

void turn_on_notify(int signo)
{
    notifyflag = 1;
}

void turn_off_notify(int signo)
{
    notifyflag = 0;
}

int check_mail(const char *filename)
{
    long old_mail_size, new_mail_size;
    sigset_t blockset, emptyset;

    sigemptyset(&blockset);
    sigemptyset(&emptyset);
    sigaddset(&blockset, SIGUSR1);
    sigaddset(&blockset, SIGUSR2);

    old_mail_size = get_file_size(filename);
    if (old_mail_size < 0)
        return 1;
    if (old_mail_size > 0)
        send_mail_notify();
    sleep(SLEEP_TIME);

    while (1) {
        if (sigprocmask(SIG_BLOCK, &blockset, NULL) < 0)
            return 1;
        while (notifyflag == 0)
            sigsuspend(&emptyset);
        if (sigprocmask(SIG_SETMASK, &emptyset, NULL) < 0)
            return 1;
        new_mail_size = get_file_size(filename);
        if (new_mail_size > old_mail_size)
            send_mail_notify();   /* original wrote "send_mail_notify;", a no-op */
        old_mail_size = new_mail_size;
        sleep(SLEEP_TIME);
    }
}

int main(void)
{
    char mailfile[MAX_FILENAME];
    struct sigaction newact;
    struct passwd *pw;

    if ((pw = getpwuid(getuid())) == NULL) {
        fprintf(stderr, "Get Login Name Error:%s\n\a", strerror(errno));
        exit(1);
    }
    strcpy(mailfile, MAIL_DIR);
    strcat(mailfile, pw->pw_name);

    newact.sa_handler = turn_on_notify;
    newact.sa_flags = 0;
    sigemptyset(&newact.sa_mask);
    sigaddset(&newact.sa_mask, SIGUSR1);
    sigaddset(&newact.sa_mask, SIGUSR2);
    if (sigaction(SIGUSR1, &newact, NULL) < 0)
        fprintf(stderr, "Turn On Error:%s\n\a", strerror(errno));

    newact.sa_handler = turn_off_notify;
    if (sigaction(SIGUSR2, &newact, NULL) < 0)  /* original registered SIGUSR1 twice */
        fprintf(stderr, "Turn Off Error:%s\n\a", strerror(errno));

    check_mail(mailfile);
    exit(0);
}

About sigemptyset()

Overview

sigemptyset() is one of a family of functions for building signal sets by hand. A signal set is a data object that lets a thread manage a group of signals as a whole: for example, a thread might keep one set recording which signals are blocked, and another recording which are pending.

Such sets are managed through sigprocmask(), passing SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK; or examined when returned by other functions (such as sigpending()).

As the name suggests, sigemptyset() initializes the given set to the empty set; in other words, all supported signals are excluded from it.

Parameters

Input: *set — a pointer to a signal set.

Return value: 0 when sigemptyset() is successful.
demo

#include <stdio.h>
#include <unistd.h>
#include <signal.h>

int main(int argc, char *argv[])
{
    struct sigaction sigact;
    sigset_t sigset;

    /* ignore SIGUSR2 */
    sigemptyset(&sigact.sa_mask);
    sigact.sa_flags = 0;
    sigact.sa_handler = SIG_IGN;
    sigaction(SIGUSR2, &sigact, NULL);

    /*
     * Unblocking all signals ensures that the signal
     * handling action will be taken when the signal
     * is generated.
     */
    sigemptyset(&sigset);
    sigprocmask(SIG_SETMASK, &sigset, NULL);

    printf("before kill()\n");
    kill(getpid(), SIGUSR2);
    printf("after kill()\n");

    return 0;
}