Introduction Link to heading
The graphics stack has a long history in the Linux kernel. The modern graphics stack consists of the following layers, starting with user-space libraries (libdrm and Mesa) and going down to the kernel graphics modules (DRM, KMS).
As ASCII art, it would look something like:
application -> opengl Mesa -> libdrm -> Linux Kernel (DRM) -> GPU
game -> opengl Mesa -> libdrm -> Linux Kernel (DRM) -> GPU(framebuffer)
A good diagram can be found on Wikipedia:
kernel DRM Link to heading
I will start from the kernel DRM and work my way up to user-land when I need to, but the focus will be on the kernel.
Starting at linux/drivers/gpu/drm/i915/i915_driver.c
with i915 (Intel graphics DRM driver for integrated graphics), drm_driver
is defined and registered with drm_dev_register
.
/*
 * i915's struct drm_driver instance: the capability flags and entry points
 * the DRM core uses for this driver.  It is registered with the DRM core
 * via drm_dev_register().
 */
static const struct drm_driver i915_drm_driver = {
/* Don't use MTRRs here; the Xserver or userspace app should
* deal with them for Intel hardware.
*/
/* Capability flags: GEM memory management, render nodes, kernel
 * modesetting, atomic commits and sync objects (incl. timelines). */
.driver_features =
DRIVER_GEM |
DRIVER_RENDER | DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_SYNCOBJ |
DRIVER_SYNCOBJ_TIMELINE,
/* Device/file lifetime callbacks. */
.release = i915_driver_release,
.open = i915_driver_open,
.lastclose = i915_driver_lastclose,
.postclose = i915_driver_postclose,
/* Per-client stats for /proc/<pid>/fdinfo; NULL if procfs is disabled. */
.show_fdinfo = PTR_IF(IS_ENABLED(CONFIG_PROC_FS), i915_drm_client_fdinfo),
/* PRIME (dma-buf) import and "dumb" (unaccelerated) buffer support. */
.gem_prime_import = i915_gem_prime_import,
.dumb_create = i915_gem_dumb_create,
.dumb_map_offset = i915_gem_dumb_mmap_offset,
/* Driver-private ioctl table (the i915_ioctls array shown below). */
.ioctls = i915_ioctls,
.num_ioctls = ARRAY_SIZE(i915_ioctls),
.fops = &i915_driver_fops,
/* Identification and version strings reported to userspace. */
.name = DRIVER_NAME,
.desc = DRIVER_DESC,
.date = DRIVER_DATE,
.major = DRIVER_MAJOR,
.minor = DRIVER_MINOR,
.patchlevel = DRIVER_PATCHLEVEL,
};
This is the call order starting from PCI driver registration in linux/drivers/gpu/drm/i915/i915_pci.c
all the way down to the call to drm_dev_register
/*
 * PCI driver descriptor for i915: matches Intel GPU PCI IDs (pciidlist)
 * and wires up the probe/remove/shutdown and power-management callbacks.
 */
static struct pci_driver i915_pci_driver = {
.name = DRIVER_NAME,
.id_table = pciidlist,
.probe = i915_pci_probe,
.remove = i915_pci_remove,
.shutdown = i915_pci_shutdown,
.driver.pm = &i915_pm_ops,
};
/*
 * Register the i915 PCI driver with the PCI core; the core will then call
 * i915_pci_probe() for each matching device.
 * Returns 0 on success or a negative errno from pci_register_driver().
 */
int i915_pci_register_driver(void)
{
return pci_register_driver(&i915_pci_driver);
}
i915_pci_probe
called to probe the GPU driver by calling i915_driver_probe
.
static int i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct intel_device_info *intel_info =
(struct intel_device_info *) ent->driver_data;
int err;
if (intel_info->require_force_probe && !id_forced(pdev->device)) {
dev_info(&pdev->dev,
"Your graphics device %04x is not properly supported by i915 in this\n"
"kernel version. To force driver probe anyway, use i915.force_probe=%04x\n"
"module parameter or CONFIG_DRM_I915_FORCE_PROBE=%04x configuration option,\n"
"or (recommended) check for kernel updates.\n",
pdev->device, pdev->device, pdev->device);
return -ENODEV;
}
if (id_blocked(pdev->device)) {
dev_info(&pdev->dev, "I915 probe blocked for Device ID %04x.\n",
pdev->device);
return -ENODEV;
}
if (intel_info->require_force_probe) {
dev_info(&pdev->dev, "Force probing unsupported Device ID %04x, tainting kernel\n",
pdev->device);
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
/* Only bind to function 0 of the device. Early generations
* used function 1 as a placeholder for multi-head. This causes
* us confusion instead, especially on the systems where both
* functions have the same PCI-ID!
*/
if (PCI_FUNC(pdev->devfn))
return -ENODEV;
if (!intel_mmio_bar_valid(pdev, intel_info))
return -ENXIO;
/* Detect if we need to wait for other drivers early on */
if (intel_display_driver_probe_defer(pdev))
return -EPROBE_DEFER;
err = i915_driver_probe(pdev, ent);
In i915_driver_probe
, i915_driver_create
is called with device pdev
.
int i915_driver_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct drm_i915_private *i915;
int ret;
ret = pci_enable_device(pdev);
if (ret) {
pr_err("Failed to enable graphics device: %pe\n", ERR_PTR(ret));
return ret;
}
i915 = i915_driver_create(pdev, ent);
And i915_driver_create
allocates and initializes the DRM device (struct drm_device embedded in the driver's private struct) by calling devm_drm_dev_alloc
.
static struct drm_i915_private *
i915_driver_create(struct pci_dev *pdev, const struct pci_device_id *ent)
{
const struct intel_device_info *match_info =
(struct intel_device_info *)ent->driver_data;
struct drm_i915_private *i915;
i915 = devm_drm_dev_alloc(&pdev->dev, &i915_drm_driver,
struct drm_i915_private, drm);
Back in i915_driver_probe
which also calls i915_driver_register
.
static void i915_driver_register(struct drm_i915_private *dev_priv)
{
struct intel_gt *gt;
unsigned int i;
i915_gem_driver_register(dev_priv);
i915_pmu_register(dev_priv);
intel_vgpu_register(dev_priv);
/* Reveal our presence to userspace */
if (drm_dev_register(&dev_priv->drm, 0)) {
drm_err(&dev_priv->drm,
"Failed to register driver for userspace access!\n");
return;
}
The driver also registers IOCTLs that are called from user land.
/*
 * Helper to populate one entry of the driver ioctl table: maps the ioctl
 * number (relative to DRM_COMMAND_BASE) to its handler function,
 * permission flags and a printable name.
 */
#define DRM_IOCTL_DEF_DRV(ioctl, _func, _flags) \
[DRM_IOCTL_NR(DRM_IOCTL_##ioctl) - DRM_COMMAND_BASE] = { \
.cmd = DRM_IOCTL_##ioctl, \
.func = _func, \
.flags = _flags, \
.name = #ioctl \
}
static const struct drm_ioctl_desc i915_ioctls[] = {
DRM_IOCTL_DEF_DRV(I915_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_FLUSH, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_FLIP, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_BATCHBUFFER, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_IRQ_EMIT, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_IRQ_WAIT, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_GETPARAM, i915_getparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_SETPARAM, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_ALLOC, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_FREE, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_INIT_HEAP, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
DRM_IOCTL_DEF_DRV(I915_CMDBUFFER, drm_noop, DRM_AUTH),
DRM_IOCTL_DEF_DRV(I915_DESTROY_HEAP, drm_noop, DRM_AUTH|DRM_MASTER)
...
DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
...
An example of IOCTL can be found in drm/intel/intel_bufmgr_gem.c
, where do_exec2
calls the ioctl. It is called from several places; one of these places is drm_intel_gem_bo_exec2
.
do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx,
drm_clip_rect_t *cliprects, int num_cliprects, int DR4,
int in_fence, int *out_fence,
unsigned int flags)
{
...
...
ret = drmIoctl(bufmgr_gem->fd,
DRM_IOCTL_I915_GEM_EXECBUFFER2_WR,
/*
 * libdrm: submit a batch buffer on the render ring.  Thin wrapper around
 * do_exec2() with no context (NULL), no input fence (-1) and no output
 * fence (NULL).
 */
static int
drm_intel_gem_bo_exec2(drm_intel_bo *bo, int used,
drm_clip_rect_t *cliprects, int num_cliprects,
int DR4)
{
return do_exec2(bo, used, NULL, cliprects, num_cliprects, DR4,
-1, NULL, I915_EXEC_RENDER);
}
/*
 * Public libdrm entry point: dispatch buffer-object execution to the
 * buffer manager's bo_exec implementation (drm_intel_gem_bo_exec2 here,
 * per the assignment shown below).
 */
drm_public int
drm_intel_bo_exec(drm_intel_bo *bo, int used,
drm_clip_rect_t * cliprects, int num_cliprects, int DR4)
{
return bo->bufmgr->bo_exec(bo, used, cliprects, num_cliprects, DR4);
}
bufmgr_gem->bufmgr.bo_exec = drm_intel_gem_bo_exec2;
Going one layer higher to mesa mesa/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
. Mesa (or OpenGL) fills out the batch buffers with commands (shaders) and passes them down to the kernel DRM through the libdrm
interfaces above(drm_intel_bo_exec
for example)
static void
i915_drm_batchbuffer_flush(struct i915_winsys_batchbuffer *ibatch,
struct pipe_fence_handle **fence,
enum i915_winsys_flush_flags flags)
{
struct i915_drm_batchbuffer *batch = i915_drm_batchbuffer(ibatch);
unsigned used;
int ret;
/* MI_BATCH_BUFFER_END */
i915_winsys_batchbuffer_dword_unchecked(ibatch, (0xA<<23));
used = batch->base.ptr - batch->base.map;
if (used & 4) {
/* MI_NOOP */
i915_winsys_batchbuffer_dword_unchecked(ibatch, 0);
used += 4;
}
/* Do the sending to HW */
ret = drm_intel_bo_subdata(batch->bo, 0, used, batch->base.map);
if (ret == 0 && i915_drm_winsys(ibatch->iws)->send_cmd)
ret = drm_intel_bo_exec(batch->bo, used, NULL, 0, 0);
void
i915_flush(struct i915_context *i915, struct pipe_fence_handle **fence,
unsigned flags)
{
struct i915_winsys_batchbuffer *batch = i915->batch;
batch->iws->batchbuffer_flush(batch, fence, flags);
/* Shorthand used by the i915 gallium driver to flush the current batch. */
#define FLUSH_BATCH(fence, flags) i915_flush(i915, fence, flags)
void
i915_clear_emit(struct pipe_context *pipe, unsigned buffers,
const union pipe_color_union *color, double depth,
unsigned stencil, unsigned destx, unsigned desty,
unsigned width, unsigned height)
{
...
...
OUT_BATCH(_3DPRIMITIVE | PRIM3D_CLEAR_RECT | 5);
OUT_BATCH_F(destx + width);
OUT_BATCH_F(desty + height);
OUT_BATCH_F(destx);
OUT_BATCH_F(desty + height);
OUT_BATCH_F(destx);
OUT_BATCH_F(desty);
if (!BEGIN_BATCH(1 + 7 + 7)) {
FLUSH_BATCH(NULL, I915_FLUSH_ASYNC);
At this point, we have seen the i915 DRM driver and IOCTL initialization. Let’s dig deeper into those.
Let’s look at IOCTL i915_gem_execbuffer2_ioctl
in drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
which calls i915_gem_do_execbuffer
int
i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
struct drm_file *file)
{
...
...
if (copy_from_user(exec2_list,
u64_to_user_ptr(args->buffers_ptr),
sizeof(*exec2_list) * count)) {
}
err = eb_relocate_parse(&eb);
...
...
err = i915_gem_do_execbuffer(dev, file, args, exec2_list);
/*
In i915_gem_do_execbuffer
, an execution buffer
is created from the commands sent from user-land. After some setup, it calls eb_submit
.
static int
i915_gem_do_execbuffer(struct drm_device *dev,
struct drm_file *file,
struct drm_i915_gem_execbuffer2 *args,
struct drm_i915_gem_exec_object2 *exec)
{
struct drm_i915_private *i915 = to_i915(dev);
struct i915_execbuffer eb;
...
err = eb_submit(&eb);
/*
 * Submit every request built for this execbuffer: first flush/move the
 * referenced objects to the GPU domain, then submit each batch in
 * creation order.  After the first error, remaining requests are still
 * iterated but no longer submitted (err is sticky).
 */
static int eb_submit(struct i915_execbuffer *eb)
{
unsigned int i;
int err;
err = eb_move_to_gpu(eb);
for_each_batch_create_order(eb, i) {
/* Stop at the first unallocated request slot. */
if (!eb->requests[i])
break;
trace_i915_request_queue(eb->requests[i], eb->batch_flags);
if (!err)
err = eb_request_submit(eb, eb->requests[i],
eb->batches[i]->vma,
eb->batch_len[i]);
}
return err;
}
eb_request_submit
is called from eb_submit
after eb_move_to_gpu
was called to move the objects to GPU memory.
static int eb_request_submit(struct i915_execbuffer *eb,
struct i915_request *rq,
struct i915_vma *batch,
u64 batch_len)
{
...
...
err = rq->context->engine->emit_bb_start(rq,
i915_vma_offset(batch) +
eb->batch_start_offset,
batch_len,
eb->batch_flags);
Further down the chain of calls, the driver emits the MI_BATCH_BUFFER_START command.
From Intel’s instruction programming guide:
The MI_BATCH_BUFFER_START command is used to initiate the execution of commands stored in a batch buffer. For restrictions on the location of batch buffers, see Batch Buffers in the Device Programming Interface chapter of MI Functions. The batch buffer can be specified as privileged or non-privileged, determining the operations considered valid when initiated from within the buffer and any attached (chained) batch buffers. See Batch Buffer Protection in the Device Programming Interface chapter of MI Functions.
/*
 * Gen6 implementation of engine->emit_bb_start: writes a two-dword
 * MI_BATCH_BUFFER_START command into the ring to start executing the
 * batch buffer at @offset.  Secure dispatch clears the non-secure bit so
 * the batch runs privileged.  Returns 0, or a negative errno when ring
 * space cannot be reserved.
 */
int gen6_emit_bb_start(struct i915_request *rq,
u64 offset, u32 len,
unsigned int dispatch_flags)
{
u32 security;
u32 *cs;
/* Default to a non-privileged batch; drop the bit for secure dispatch. */
security = MI_BATCH_NON_SECURE_I965;
if (dispatch_flags & I915_DISPATCH_SECURE)
security = 0;
/* Reserve two dwords in the ring: the command and the batch address. */
cs = intel_ring_begin(rq, 2);
if (IS_ERR(cs))
return PTR_ERR(cs);
cs = __gen6_emit_bb_start(cs, offset, security);
intel_ring_advance(rq, cs);
return 0;
}
/*
 * Emit the MI_BATCH_BUFFER_START opcode (OR'ed with the security flags)
 * followed by the batch address into the command stream; returns the
 * advanced stream pointer.
 */
static inline u32 *__gen6_emit_bb_start(u32 *cs, u32 addr, unsigned int flags)
{
*cs++ = MI_BATCH_BUFFER_START | flags;
*cs++ = addr;
return cs;
}
Note that the driver does some checks before submitting to the GPU. For example, intel_engine_cmd_parser
is called from eb_relocate_parse
(see ioctl entry point above.)
/**
* intel_engine_cmd_parser() - parse a batch buffer for privilege violations
* @engine: the engine on which the batch is to execute
* @batch: the batch buffer in question
* @batch_offset: byte offset in the batch at which execution starts
* @batch_length: length of the commands in batch_obj
* @shadow: validated copy of the batch buffer in question
* @trampoline: true if we need to trampoline into privileged execution
*
* Parses the specified batch buffer looking for privilege violations as
* described in the overview.
*
* Return: non-zero if the parser finds violations or otherwise fails; -EACCES
* if the batch appears legal but should use hardware parsing
*/
int intel_engine_cmd_parser(struct intel_engine_cs *engine,
struct i915_vma *batch,
unsigned long batch_offset,
unsigned long batch_length,
struct i915_vma *shadow,
bool trampoline)
{
Intel GPU package Link to heading
To see the load on the GPU, there is a package intel-gpu-tools
which has some tools like intel_gpu_top
.