qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 2/2] virtio-blk: add zoned storage emulation for zoned device


From: Stefan Hajnoczi
Subject: Re: [PATCH 2/2] virtio-blk: add zoned storage emulation for zoned devices
Date: Mon, 19 Sep 2022 17:25:23 -0400

On Sat, Sep 10, 2022 at 02:50:57PM +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> The guest os having zoned device support can use blkzone(8) to test those
> commands. Furthermore, using zonefs to test zone append write is also
> supported.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  hw/block/virtio-blk.c | 326 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 326 insertions(+)
> 
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index e9ba752f6b..3ef74c01db 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -46,6 +46,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>  
> @@ -614,6 +616,273 @@ err:
>      return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +        struct {
> +            unsigned int nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } ZoneReportData;
> +        struct {
> +            int64_t append_sector;
> +        } ZoneAppendData;

Field names should be lowercase:

  struct {
      unsigned int nr_zones;
      BlockZoneDescriptor *zones;
  } zone_report_data;
  struct {
      int64_t append_sector;
  } zone_append_data;

> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zone_model: error checking before issuing requests. If all checks

Maybe rename it to check_zoned_request()? It does more than check the
model.

> + * passed, return true.
> + * append: true if only zone append request issued.
> + */
> +static bool check_zone_model(VirtIOBlock *s, int64_t sector, int64_t 
> nr_sector,
> +                             bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    BlockZoneDescriptor *zone = &bs->bl.zones[sector / bs->bl.zone_sectors];

Inputs from the guest driver are untrusted and must be validated before
using them. sector could have any value here, including invalid values.
Please check that sector is less than the device capacity and also that
it is positive.

> +    int64_t max_append_sector = bs->bl.max_append_sectors;
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +        *status = VIRTIO_BLK_S_UNSUPP;
> +        return false;
> +    }
> +
> +    if (zone->cond == BLK_ZS_OFFLINE) {
> +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        return false;
> +    }
> +
> +    if (append) {
> +        if ((zone->type != BLK_ZT_SWR) || (zone->cond == BLK_ZS_RDONLY) ||
> +            (sector + nr_sector > (*(zone + 1)).start)) {
> +            /* the end sector of the request exceeds to next zone */
> +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            return false;
> +        }
> +
> +        if (nr_sector > max_append_sector) {
> +            if (max_append_sector == 0) {
> +                *status = VIRTIO_BLK_S_UNSUPP;
> +            } else {
> +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            }
> +            return false;
> +        }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;
> +    int64_t zrp_size, nz, n, j = 0;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    nz = data->ZoneReportData.nr_zones;
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too 
> small!");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct 
> virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +                (struct virtio_blk_zone_descriptor) {
> +                        .z_start = 
> cpu_to_le64(data->ZoneReportData.zones[j].start),
> +                        .z_cap = 
> cpu_to_le64(data->ZoneReportData.zones[j].cap),
> +                        .z_wp = 
> cpu_to_le64(data->ZoneReportData.zones[j].wp),

If the QEMU block layer uses byte constants it will be necessary to
convert the above fields to sectors. I think the code is correct right
now because the QEMU block layer patches still use sectors, but using
bytes would be consistent with the QEMU block layer conventions.

> +                        .z_type = data->ZoneReportData.zones[j].type,
> +                        .z_state = data->ZoneReportData.zones[j].cond,

The QEMU type and cond field constants might not match the virtio-blk
constants. Please convert them explicitly (e.g. with a switch
statement in a helper function) so there is no assumption about the
values of the constants.

> +                };
> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));

This is O(n^2) time complexity since it's called from the loop, but
nevermind for now... Maybe add a TODO comment so anyone trying to
optimize this code will immediately see the expensive part.

> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            goto out;
> +        }
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data->ZoneReportData.zones);
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    unsigned int nr_zones;
> +    ZoneCmdData *data;
> +    int64_t zone_size, offset;
> +    uint8_t err_status;
> +
> +    if (req->in_len <= sizeof(struct virtio_blk_inhdr) +
> +                       sizeof(struct virtio_blk_zone_report)) {
> +        virtio_error(vdev, "in buffer too small for zone report");
> +        return -1;
> +    }
> +
> +    /* start byte offset of the zone report */
> +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> +                sizeof(struct virtio_blk_zone_report)) /
> +               sizeof(struct virtio_blk_zone_descriptor);
> +
> +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> +    data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->ZoneReportData.nr_zones = nr_zones;
> +    data->ZoneReportData.zones = g_malloc(zone_size),

nr_zones = 0 and zones = NULL is possible when in_len is sizeof(struct
virtio_blk_inhdr) + sizeof(struct virtio_blk_zone_report) + 1. Maybe the
code handles it okay without dereferencing a NULL pointer, but it would
be safer to change the check above like this:

  if (req->in_len < sizeof(struct virtio_blk_inhdr) +
                    sizeof(struct virtio_blk_zone_report) +
                    sizeof(struct virtio_blk_zone_descriptor)) {
      virtio_error(vdev, "in buffer too small for zone report");
      return -1;
  }

> +
> +    blk_aio_zone_report(s->blk, offset, &data->ZoneReportData.nr_zones,
> +                        data->ZoneReportData.zones,
> +                        virtio_blk_zone_report_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;

Missing ret < 0 error handling.

> +
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    uint64_t len;
> +    uint32_t type;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +
> +    type = virtio_ldl_p(vdev, &req->out.type);
> +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> +        /* Entire drive capacity */
> +        offset = 0;
> +        blk_get_geometry(s->blk, &len);
> +        len *= 512;
> +    } else {
> +        len = bs->bl.zone_sectors * 512;

Is this correct when accessing the last zone of the device?

> +    }
> +
> +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> +                      virtio_blk_zone_mgmt_complete, data);
> +
> +    return 0;
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    int64_t append_sector, n;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    unsigned out_num = req->elem.out_num;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;

Error handling code for the ret < 0 case is missing.

> +
> +    append_sector = data->ZoneAppendData.append_sector;
> +    int write_granularity = s->conf.conf.logical_block_size;
> +    if ((append_sector * 512 % write_granularity) != 0) {
> +        err_status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +        goto out;
> +    }

This looks like input validation. Why is it performed after the I/O
request has completed?

I guess the intent is to check the value that the guest driver provided,
not the value produced by the device after the I/O request completed?

> +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, 
> sizeof(append_sector));

Please use virtio_stq_p() on append_sector first to ensure that the
endianness is correct.

> +    if (n != sizeof(append_sector)) {
> +        virtio_error(vdev, "Driver provided input buffer less than size of "
> +                     "append_sector");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {

The return value is not used. Please change the return type to void.

> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    uint64_t niov = req->elem.out_num;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    int64_t len = 0;
> +    for (int i = 1; i < niov; ++i) {
> +        len += out_iov[i].iov_len;
> +    }

Why is the first iovec skipped?

> +
> +    if (!check_zone_model(s, offset / 512, len / 512, true, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->ZoneAppendData.append_sector = offset;
> +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
> +    blk_aio_zone_append(s->blk, &data->ZoneAppendData.append_sector, 
> &req->qiov, 0,
> +                        virtio_blk_zone_append_complete, data);
> +
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer 
> *mrb)
>  {
>      uint32_t type;
> @@ -700,6 +969,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq 
> *req, MultiReqBuffer *mrb)
>      case VIRTIO_BLK_T_FLUSH:
>          virtio_blk_handle_flush(req, mrb);
>          break;
> +    case VIRTIO_BLK_T_ZONE_REPORT:
> +        virtio_blk_handle_zone_report(req);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_OPEN:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_CLOSE:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_FINISH:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
> +        break;
>      case VIRTIO_BLK_T_SCSI_CMD:
>          virtio_blk_handle_scsi(req);
>          break;
> @@ -718,6 +1005,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq 
> *req, MultiReqBuffer *mrb)
>          virtio_blk_free_request(req);
>          break;
>      }
> +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> +       virtio_blk_handle_zone_append(req);
> +       break;
>      /*
>       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
>       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch 
> statement,
> @@ -917,6 +1207,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
> uint8_t *config)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>      BlockConf *conf = &s->conf.conf;
> +    BlockDriverState *state = blk_bs(s->blk);

Usually the variable is called "bs". Please use that name here and
elsewhere in the patch.

>      struct virtio_blk_config blkcfg;
>      uint64_t capacity;
>      int64_t length;
> @@ -976,6 +1267,31 @@ static void virtio_blk_update_config(VirtIODevice 
> *vdev, uint8_t *config)
>          blkcfg.write_zeroes_may_unmap = 1;
>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>      }
> +#ifdef CONFIG_BLKZONED

Is this ifdef appropriate? I think bs->bl.zoned should always be
available, even when <blkzoned.h> is missing (e.g. non-Linux system). In
the future there may be an emulated zoned BlockDriver. virtio-blk
should be able to use the emulated zoned BlockDriver even on non-Linux
systems.

I think CONFIG_BLKZONED should be limited to block/file-posix.c.

> +    if (state->bl.zoned != BLK_Z_NONE) {
> +        switch (state->bl.zoned) {
> +        case BLK_Z_HM:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> +            virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> +                         state->bl.zone_sectors);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> +                         state->bl.max_active_zones);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> +                         state->bl.max_open_zones);
> +            virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> +                         state->bl.max_append_sectors);
> +            break;
> +        case BLK_Z_HA:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;

The block limits aren't relevant to host-aware zoned devices?

> +            break;
> +        default:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> +            virtio_error(vdev, "Invalid zoned model %x \n", 
> (int)state->bl.zoned);
> +            break;
> +        }
> +    }
> +#endif
>      memcpy(config, &blkcfg, s->config_size);
>  }
>  
> @@ -995,6 +1311,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice 
> *vdev, uint64_t features,
>                                          Error **errp)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
> +    BlockDriverState *state = blk_bs(s->blk);
>  
>      /* Firstly sync all virtio-blk possible supported features */
>      features |= s->host_features;
> @@ -1003,6 +1320,12 @@ static uint64_t virtio_blk_get_features(VirtIODevice 
> *vdev, uint64_t features,
>      virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
>      virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
>      virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
> +    if (state->bl.zoned != BLK_Z_NONE) {
> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> +        if (state->bl.zoned == BLK_Z_HM) {
> +            virtio_clear_feature(&features, VIRTIO_BLK_F_DISCARD);

Why is features modified here but s->host_features is modified in the
line above?

> +        }
> +    }

This detects VIRTIO_BLK_F_ZONED based on the BlockDriverState...

>      if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
>          if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) {
>              error_setg(errp, "Please set scsi=off for virtio-blk devices in 
> order to use virtio 1.0");
> @@ -1286,6 +1609,9 @@ static Property virtio_blk_properties[] = {
>  #ifdef __linux__
>      DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features,
>                        VIRTIO_BLK_F_SCSI, false),
> +#endif
> +#ifdef CONFIG_BLKZONED
> +    DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, 
> VIRTIO_BLK_F_ZONED, true),
>  #endif

...but this allows users to enable/disable the flag from the
command-line.

I think DEFINE_PROP_BIT64() can be removed, but please check that the
config space size is still correct. It may be necessary to move the
bs->bl.zoned check into virtio_blk_device_realize() and set
the VIRTIO_BLK_F_ZONED s->host_features bit there instead. That will
allow the s->config_size calculation to work correctly.

Attachment: signature.asc
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]