2018-09-06

I have written a blog about kvm interrupt emulation. As we know, the QEMU can emulation the whole system, in this blog, I will disscuss how the QEMU emulate the interrupt chip of a virtual machine. In this blog, we assume that all of the irqchip is emulated in QEMU, set the qemu command line with ‘-machine kernel-irqchip=off’ can achive this.

Interrupt controller initialization

The function ‘pc_init1’ first allocates the ‘pcms->gsi’ to represent the interrupt delivery start point.

pcms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS);

qemu_irq *qemu_allocate_irqs(qemu_irq_handler handler, void *opaque, int n)
{
    return qemu_extend_irqs(NULL, 0, handler, opaque, n);
}

qemu_irq *qemu_extend_irqs(qemu_irq *old, int n_old, qemu_irq_handler handler,
                           void *opaque, int n)
{
    qemu_irq *s;
    int i;

    if (!old) {
        n_old = 0;
    }
    s = old ? g_renew(qemu_irq, old, n + n_old) : g_new(qemu_irq, n);
    for (i = n_old; i < n + n_old; i++) {
        s[i] = qemu_allocate_irq(handler, opaque, i);
    }
    return s;
}

This function allocates 24 ‘qemu_irq’ struct and the handler is set to ‘gsi_handler’. Here ‘gsi’ is the abbreviation of ‘global system interrupts’.

Later, in ‘i440fx_init’, this ‘gsi’ is assigned to ‘piix3->pic’, it also calls ‘pci_bus_irqs’ to set the pci bus’s ‘set_irq’ and ‘get_irq’ function.

    pci_bus_irqs(b, piix3_set_irq, pci_slot_get_pirq, piix3,
            PIIX_NUM_PIRQS);

‘piix3_set_irq’ function finally calls ‘piix3_\set_irq_pic’, which we can see calls the ‘piix3->pic’ which is the ‘gsi’.

static void piix3_set_irq_pic(PIIX3State *piix3, int pic_irq)
{
    qemu_set_irq(piix3->pic[pic_irq],
                 !!(piix3->pic_levels &
                    (((1ULL << PIIX_NUM_PIRQS) - 1) <<
                     (pic_irq * PIIX_NUM_PIRQS))));
}

Return to ‘pc_init1’, it calls ‘isa_bus_irqs’, this function set the ISA bus’s irqs to ‘gsi’.

    isa_bus_irqs(isa_bus, pcms->gsi);

As we emulate irqchip in QEMU, it calls ‘i8259_init’, it first calls ‘pc_allocate_cpu_irq’ to allocate a parent_irq. ‘pic_irq_request’ is used as this irq’s handler.

    if (kvm_pic_in_kernel()) {
        i8259 = kvm_i8259_init(isa_bus);
    } else if (xen_enabled()) {
        i8259 = xen_interrupt_controller_init();
    } else {
        i8259 = i8259_init(isa_bus, pc_allocate_cpu_irq());
    }
    
qemu_irq pc_allocate_cpu_irq(void)
{
    return qemu_allocate_irq(pic_irq_request, NULL, 0);
}

In order to understand function ‘i8259_init’, we need first to look at the i8259 realize function.

static void pic_realize(DeviceState *dev, Error **errp)
{
    PICCommonState *s = PIC_COMMON(dev);
    PICClass *pc = PIC_GET_CLASS(dev);

    memory_region_init_io(&s->base_io, OBJECT(s), &pic_base_ioport_ops, s,
                          "pic", 2);
    memory_region_init_io(&s->elcr_io, OBJECT(s), &pic_elcr_ioport_ops, s,
                          "elcr", 1);

    qdev_init_gpio_out(dev, s->int_out, ARRAY_SIZE(s->int_out));
    qdev_init_gpio_in(dev, pic_set_irq, 8);

    pc->parent_realize(dev, errp);
}

In ‘pic_realize’ function, the most import function is ‘qdev_init_gpio_out’ and ‘qdev_init_gpio_in’.

    void qdev_init_gpio_out(DeviceState *dev, qemu_irq *pins, int n)
    {
        qdev_init_gpio_out_named(dev, pins, NULL, n);
    }
    
    void qdev_init_gpio_out_named(DeviceState *dev, qemu_irq *pins,
                                const char *name, int n)
    {
        int i;
        NamedGPIOList *gpio_list = qdev_get_named_gpio_list(dev, name);
    
        assert(gpio_list->num_in == 0 || !name);
    
        if (!name) {
            name = "unnamed-gpio-out";
        }
        memset(pins, 0, sizeof(*pins) * n);
        for (i = 0; i < n; ++i) {
            gchar *propname = g_strdup_printf("%s[%u]", name,
                                            gpio_list->num_out + i);
    
            object_property_add_link(OBJECT(dev), propname, TYPE_IRQ,
                                    (Object **)&pins[i],
                                    object_property_allow_set_link,
                                    OBJ_PROP_LINK_UNREF_ON_RELEASE,
                                    &error_abort);
            g_free(propname);
        }
        gpio_list->num_out += n;
    }

‘qdev_init_gpio_out’ function add a link property named ‘unamed-gpio-out[0]’ and set the link *child to ‘address of ‘s->int_out’. Likely , ‘qdev_init_gpio_in’ adds 8 ‘unamed-gpio-in[0]’ link property.

Return back function ‘i8259_init’.

    qemu_irq *i8259_init(ISABus *bus, qemu_irq parent_irq)
    {
        qemu_irq *irq_set;
        DeviceState *dev;
        ISADevice *isadev;
        int i;
    
        irq_set = g_new0(qemu_irq, ISA_NUM_IRQS);
    
        isadev = i8259_init_chip(TYPE_I8259, bus, true);
        dev = DEVICE(isadev);
    
        qdev_connect_gpio_out(dev, 0, parent_irq);
        for (i = 0 ; i < 8; i++) {
            irq_set[i] = qdev_get_gpio_in(dev, i);
        }
    
        isa_pic = dev;
    
        isadev = i8259_init_chip(TYPE_I8259, bus, false);
        dev = DEVICE(isadev);
    
        qdev_connect_gpio_out(dev, 0, irq_set[2]);
        for (i = 0 ; i < 8; i++) {
            irq_set[i + 8] = qdev_get_gpio_in(dev, i);
        }
    
        slave_pic = PIC_COMMON(dev);
    
        return irq_set;
    }

First create the master pic and set the output pin (‘s->int_out’) to the ‘parent_irq’, this is done through function ‘qdev_connect_gpio_out_named’ which set the ‘unamed-gpio-out[0]’ link property. Then create the slave pic and set its out pin to the master’s second in pin. Finally return the ‘irq_set’, this is all of the pic’s ‘qemu_irq’.

Then these ‘qemu_irq’ is assigned to ‘gsi_state’ and calls ‘ioapic_init_gsi’ to initialize the IOAPIC.

void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
{
    DeviceState *dev;
    SysBusDevice *d;
    unsigned int i;

    if (kvm_ioapic_in_kernel()) {
        dev = qdev_create(NULL, "kvm-ioapic");
    } else {
        dev = qdev_create(NULL, "ioapic");
    }
    if (parent_name) {
        object_property_add_child(object_resolve_path(parent_name, NULL),
                                  "ioapic", OBJECT(dev), NULL);
    }
    qdev_init_nofail(dev);
    d = SYS_BUS_DEVICE(dev);
    sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);

    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
        gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
    }
}

Here create the ioapic device and set the ‘gsi_state->ioapic_irq’ with the ioapic’s ‘qemu_irq’. The later is created in the realize of ioapic device. The handler is ‘ioapic_set_irq’.

Interrupt delivery

Let’s take a PCI device’s interrupt delivery as an example. The PCI device can call ‘pci_set_irq’ to issue an interrupt to the kernel.

void pci_set_irq(PCIDevice *pci_dev, int level)
{
    int intx = pci_intx(pci_dev);
    pci_irq_handler(pci_dev, intx, level);
}
static inline int pci_intx(PCIDevice *pci_dev)
{
    return pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
}

static void pci_irq_handler(void *opaque, int irq_num, int level)
{
    PCIDevice *pci_dev = opaque;
    int change;

    change = level - pci_irq_state(pci_dev, irq_num);
    if (!change)
        return;

    pci_set_irq_state(pci_dev, irq_num, level);
    pci_update_irq_status(pci_dev);
    if (pci_irq_disabled(pci_dev))
        return;
    pci_change_irq_level(pci_dev, irq_num, change);
}

static void pci_change_irq_level(PCIDevice *pci_dev, int irq_num, int change)
{
    PCIBus *bus;
    for (;;) {
        bus = pci_get_bus(pci_dev);
        irq_num = bus->map_irq(pci_dev, irq_num);
        if (bus->set_irq)
            break;
        pci_dev = bus->parent_dev;
    }
    bus->irq_count[irq_num] += change;
    bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
}

There are a little PCI-specific knowledge I will not discuss just focus the interrupt instead. In the last function ‘pci_change_irq_level’, it calls the PCI bus’ ‘map_irq’ to get the irq number and then call ‘set_irq’ which as we know is ‘piix3_set_irq’. This function calls ‘piix3_set_irq_pic’.

static void piix3_set_irq_pic(PIIX3State *piix3, int pic_irq)
{
    qemu_set_irq(piix3->pic[pic_irq],
                 !!(piix3->pic_levels &
                    (((1ULL << PIIX_NUM_PIRQS) - 1) <<
                     (pic_irq * PIIX_NUM_PIRQS))));
}

The piix3->pic is the gsi and the handler is ‘gsi_handler’.

void gsi_handler(void *opaque, int n, int level)
{
    GSIState *s = opaque;

    DPRINTF("pc: %s GSI %d\n", level ? "raising" : "lowering", n);
    if (n < ISA_NUM_IRQS) {
        qemu_set_irq(s->i8259_irq[n], level);
    }
    qemu_set_irq(s->ioapic_irq[n], level);
}

Choose the interrupt controller according the irq number and aclls the corresponding handler. Take ioapic_irq as for example, the handler is ‘ioapic_set_irq’. In this function it calls ‘ioapic_service’ to delivery interrupt to the LAPIC. This is through ‘stl_le_phys’, this will cause the apic’s MMIO write function being called, which is ‘apic_mem_writel’. APIC can call ‘apic_update_irq’ to process interrupt. THen ‘cpu_interrupt’ and finally ‘kvm_handle_interrupt’ is called.

static void kvm_handle_interrupt(CPUState *cpu, int mask)
{
    cpu->interrupt_request |= mask;

    if (!qemu_cpu_is_self(cpu)) {
        qemu_cpu_kick(cpu);
    }
}

Here set the ‘cpu->interrupt_request’ then in the next enter the guest, the QEMU will call ioctl with ‘KVM_INTERRUPT’ ioctl to inject the interrupt to the guest.

Let’s see a backtrack of interrupt delivery to make a more deep impression.

(gdb) bt
#0  apic_mem_write (opaque=0x61600000a280, addr=16388, val=33, size=4)
    at /home/test/qemu/hw/intc/apic.c:756
#1  0x000055ce1f7241fd in memory_region_write_accessor (mr=0x61600000a300, 
    addr=16388, value=0x7f329b8f8188, size=4, shift=0, mask=4294967295, 
    attrs=...) at /home/test/qemu/memory.c:526
#2  0x000055ce1f7244d6 in access_with_adjusted_size (addr=16388, 
    value=0x7f329b8f8188, size=4, access_size_min=1, access_size_max=4, 
    access_fn=0x55ce1f72404f <memory_region_write_accessor>, 
    mr=0x61600000a300, attrs=...) at /home/test/qemu/memory.c:593
#3  0x000055ce1f72b2cc in memory_region_dispatch_write (mr=0x61600000a300, 
    addr=16388, data=33, size=4, attrs=...) at /home/test/qemu/memory.c:1473
#4  0x000055ce1f65021b in address_space_stl_internal (
    as=0x55ce2142c940 <address_space_memory>, addr=4276109316, val=33, 
    attrs=..., result=0x0, endian=DEVICE_LITTLE_ENDIAN)
    at /home/test/qemu/memory_ldst.inc.c:349
#5  0x000055ce1f65047f in address_space_stl_le (
    as=0x55ce2142c940 <address_space_memory>, addr=4276109316, val=33, 
    attrs=..., result=0x0) at /home/test/qemu/memory_ldst.inc.c:386
#6  0x000055ce1f80aff5 in stl_le_phys (
    as=0x55ce2142c940 <address_space_memory>, addr=4276109316, val=33)
    at /home/test/qemu/include/exec/memory_ldst_phys.inc.h:103
#7  0x000055ce1f80c8af in ioapic_service (s=0x61b000002a80)
    at /home/test/qemu/hw/intc/ioapic.c:136
---Type <return> to continue, or q <return> to quit---
#8  0x000055ce1f80cb35 in ioapic_set_irq (opaque=0x61b000002a80, vector=15, 
    level=1) at /home/test/qemu/hw/intc/ioapic.c:175
#9  0x000055ce1fbe79a0 in qemu_set_irq (irq=0x60600006a880, level=1)
    at hw/core/irq.c:45
#10 0x000055ce1f8bfb1c in gsi_handler (opaque=0x612000007540, n=15, level=1)
    at /home/test/qemu/hw/i386/pc.c:120
#11 0x000055ce1fbe79a0 in qemu_set_irq (irq=0x6060000414e0, level=1)
    at hw/core/irq.c:45
#12 0x000055ce1fc8a0f3 in bmdma_irq (opaque=0x6250001c3e10, n=0, level=1)
    at hw/ide/pci.c:222
#13 0x000055ce1fbe79a0 in qemu_set_irq (irq=0x606000091280, level=1)
    at hw/core/irq.c:45
#14 0x000055ce1fc7ba3c in qemu_irq_raise (irq=0x606000091280)
    at /home/test/qemu/include/hw/irq.h:16
#15 0x000055ce1fc7bb20 in ide_set_irq (bus=0x6250001c32c0)
    at /home/test/qemu/include/hw/ide/internal.h:568
#16 0x000055ce1fc7fa75 in ide_atapi_cmd_reply_end (s=0x6250001c3338)
    at hw/ide/atapi.c:319
#17 0x000055ce1fc7902c in ide_data_readl (opaque=0x6250001c32c0, addr=368)
    at hw/ide/core.c:2389
#18 0x000055ce1f713e32 in portio_read (opaque=0x614000002040, addr=0, size=4)
    at /home/test/qemu/ioport.c:180
#19 0x000055ce1f7239bb in memory_region_read_accessor (mr=0x614000002040, 
---Type <return> to continue, or q <return> to quit---
    addr=0, value=0x7f329b8f8790, size=4, shift=0, mask=4294967295, attrs=...)
    at /home/test/qemu/memory.c:435
#20 0x000055ce1f7244d6 in access_with_adjusted_size (addr=0, 
    value=0x7f329b8f8790, size=4, access_size_min=1, access_size_max=4, 
    access_fn=0x55ce1f723913 <memory_region_read_accessor>, mr=0x614000002040, 
    attrs=...) at /home/test/qemu/memory.c:593
#21 0x000055ce1f72aa42 in memory_region_dispatch_read1 (mr=0x614000002040, 
    addr=0, pval=0x7f329b8f8790, size=4, attrs=...)
    at /home/test/qemu/memory.c:1392
#22 0x000055ce1f72ac25 in memory_region_dispatch_read (mr=0x614000002040, 
    addr=0, pval=0x7f329b8f8790, size=4, attrs=...)
    at /home/test/qemu/memory.c:1423
#23 0x000055ce1f64cd0c in flatview_read_continue (fv=0x60600017f120, addr=368, 
    attrs=..., buf=0x7f329e50c004 "", len=4, addr1=0, l=4, mr=0x614000002040)
    at /home/test/qemu/exec.c:3293
#24 0x000055ce1f64d028 in flatview_read (fv=0x60600017f120, addr=368, 
    attrs=..., buf=0x7f329e50c004 "", len=4) at /home/test/qemu/exec.c:3331
#25 0x000055ce1f64d0ed in address_space_read_full (
    as=0x55ce2142c8c0 <address_space_io>, addr=368, attrs=..., 
    buf=0x7f329e50c004 "", len=4) at /home/test/qemu/exec.c:3344
#26 0x000055ce1f64d1c4 in address_space_rw (
    as=0x55ce2142c8c0 <address_space_io>, addr=368, attrs=..., 
    buf=0x7f329e50c004 "", len=4, is_write=false)
---Type <return> to continue, or q <return> to quit---
    at /home/test/qemu/exec.c:3374
#27 0x000055ce1f770021 in kvm_handle_io (port=368, attrs=..., 
    data=0x7f329e50c000, direction=0, size=4, count=2)
    at /home/test/qemu/accel/kvm/kvm-all.c:1731
#28 0x000055ce1f7712f9 in kvm_cpu_exec (cpu=0x631000028800)
    at /home/test/qemu/accel/kvm/kvm-all.c:1971
#29 0x000055ce1f6e5650 in qemu_kvm_cpu_thread_fn (arg=0x631000028800)
    at /home/test/qemu/cpus.c:1257
#30 0x000055ce20354746 in qemu_thread_start (args=0x603000024a60)
    at util/qemu-thread-posix.c:504
#31 0x00007f32a175b6db in start_thread (arg=0x7f329b8f9700)
    at pthread_create.c:463
#32 0x00007f32a148488f in clone ()


blog comments powered by Disqus