qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC 3/4] qcow2: add zoned emulation capability


From: Stefan Hajnoczi
Subject: Re: [RFC 3/4] qcow2: add zoned emulation capability
Date: Mon, 19 Jun 2023 16:35:59 +0200

On Mon, Jun 05, 2023 at 06:41:07PM +0800, Sam Li wrote:
> By adding zone operations and zoned metadata, the zoned emulation
> capability enables full emulation support of zoned device using
> a qcow2 file. The zoned device metadata includes zone type,
> zoned device state and write pointer of each zone, which is stored
> to an array of unsigned integers.
> 
> Each zone of a zoned device makes state transitions following
> the zone state machine. The zone state machine mainly describes
> five states, IMPLICIT OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED.
> READ ONLY and OFFLINE states will generally be affected by device
> internal events. The operations on zones cause corresponding state
> changing.
> 
> Zoned devices have a limit on zone resources, which puts constraints on
> write operations into zones.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/qcow2.c | 629 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  block/qcow2.h |   2 +
>  2 files changed, 629 insertions(+), 2 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index b886dab42b..f030965d5d 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -194,6 +194,164 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char 
> *fmt, Error **errp)
>      return cryptoopts_qdict;
>  }
>  
> +#define QCOW2_ZT_IS_CONV(wp)    (wp & 1ULL << 59)
> +
> +static inline int qcow2_get_wp(uint64_t wp)
> +{
> +    /* clear state and type information */
> +    return ((wp << 5) >> 5);
> +}
> +
> +static inline int qcow2_get_zs(uint64_t wp)
> +{
> +    return (wp >> 60);
> +}
> +
> +static inline void qcow2_set_wp(uint64_t *wp, BlockZoneState zs)
> +{
> +    uint64_t addr = qcow2_get_wp(*wp);
> +    addr |= ((uint64_t)zs << 60);
> +    *wp = addr;
> +}
> +
> +/*
> + * File wp tracking: reset zone, finish zone and append zone can
> + * change the value of write pointer. All zone operations will change
> + * the state of that/those zone.
> + * */
> +static inline void qcow2_wp_tracking_helper(int index, uint64_t wp) {
> +    /* format: operations, the wp. */
> +    printf("wps[%d]: 0x%x\n", index, qcow2_get_wp(wp)>>BDRV_SECTOR_BITS);
> +}
> +
> +/*
> + * Perform a state assignment and a flush operation that writes the new wp
> + * value to the dedicated location of the disk file.
> + */
> +static int qcow2_write_wp_at(BlockDriverState *bs, uint64_t *wp,
> +                             uint32_t index, BlockZoneState zs) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    qcow2_set_wp(wp, zs);
> +    ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset
> +        + sizeof(uint64_t) * index, sizeof(uint64_t), wp, 0);
> +
> +    if (ret < 0) {
> +        goto exit;
> +    }
> +    qcow2_wp_tracking_helper(index, *wp);
> +    return ret;
> +
> +exit:
> +    error_report("Failed to write metadata with file");
> +    return ret;
> +}
> +
> +static int qcow2_check_active(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +
> +    if (!s->zoned_header.max_active_zones) {
> +        return 0;
> +    }
> +
> +    if (s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed
> +        < s->zoned_header.max_active_zones) {
> +        return 0;
> +    }
> +
> +    return -1;
> +}
> +
> +static int qcow2_check_open(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    if (!s->zoned_header.max_open_zones) {
> +        return 0;
> +    }
> +
> +    if (s->nr_zones_exp_open + s->nr_zones_imp_open
> +        < s->zoned_header.max_open_zones) {
> +        return 0;
> +    }
> +
> +    if(s->nr_zones_imp_open) {
> +        ret = qcow2_check_active(bs);
> +        if (ret == 0) {
> +            /* TODO: it takes O(n) time complexity (n = nr_zones).
> +             * Optimizations required. */
> +            /* close one implicitly open zones to make it available */
> +            for (int i = s->zoned_header.zone_nr_conv;
> +            i < bs->bl.nr_zones; ++i) {
> +                uint64_t *wp = &s->wps->wp[i];
> +                if (qcow2_get_zs(*wp) == BLK_ZS_IOPEN) {
> +                    ret = qcow2_write_wp_at(bs, wp, i, BLK_ZS_CLOSED);
> +                    if (ret < 0) {
> +                        return ret;
> +                    }
> +                    s->wps->wp[i] = *wp;
> +                    s->nr_zones_imp_open--;
> +                    s->nr_zones_closed++;
> +                    break;
> +                }
> +            }
> +            return 0;
> +        }
> +        return ret;
> +    }
> +
> +    return -1;
> +}
> +
> +/*
> + * The zoned device has limited zone resources of open, closed, active
> + * zones.
> + */
> +static int qcow2_check_zone_resources(BlockDriverState *bs,
> +                                      BlockZoneState zs)
> +{
> +    int ret;
> +
> +    switch (zs) {
> +    case BLK_ZS_EMPTY:
> +        ret = qcow2_check_active(bs);
> +        if (ret < 0) {
> +            error_report("No enough active zones");
> +            return ret;
> +        }
> +        return ret;
> +    case BLK_ZS_CLOSED:
> +        ret = qcow2_check_open(bs);
> +        if (ret < 0) {
> +            error_report("No enough open zones");
> +            return ret;
> +        }
> +        return ret;
> +    default:
> +        return -EINVAL;
> +    }
> +
> +}
> +
> +static inline int qcow2_refresh_zonedmeta(BlockDriverState *bs)
> +{
> +    int ret;
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t *temp = g_malloc(s->zoned_header.zonedmeta_size);
> +    ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset,
> +                     s->zoned_header.zonedmeta_size, temp, 0);
> +    if (ret < 0) {
> +        error_report("Can not read metadata\n");
> +        return ret;
> +    }
> +
> +    memcpy(s->wps->wp, temp, s->zoned_header.zonedmeta_size);
> +    return 0;
> +}
> +
>  /*
>   * read qcow2 extension and fill bs
>   * start reading from start_offset
> @@ -455,7 +613,19 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t 
> start_offset,
>                  be32_to_cpu(zoned_ext.max_active_zones);
>              zoned_ext.max_append_sectors =
>                  be32_to_cpu(zoned_ext.max_append_sectors);
> +            zoned_ext.zonedmeta_offset =
> +                be64_to_cpu(zoned_ext.zonedmeta_offset);
> +            zoned_ext.zonedmeta_size = be64_to_cpu(zoned_ext.zonedmeta_size);
>              s->zoned_header = zoned_ext;
> +            s->wps = g_malloc(sizeof(BlockZoneWps)
> +                    + s->zoned_header.zonedmeta_size);
> +            ret = qcow2_refresh_zonedmeta(bs);
> +            if (ret < 0) {
> +                error_setg_errno(errp, -ret, "zonedmeta: "
> +                                             "Could not update zoned meta");
> +                return ret;
> +            }
> +            qemu_co_mutex_init(&s->wps->colock);
>  
>  #ifdef DEBUG_EXT
>              printf("Qcow2: Got zoned format extension: "
> @@ -1982,6 +2152,14 @@ static void qcow2_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>      }
>      bs->bl.pwrite_zeroes_alignment = s->subcluster_size;
>      bs->bl.pdiscard_alignment = s->cluster_size;
> +    bs->bl.zoned = s->zoned_header.zoned;
> +    bs->bl.nr_zones = s->zoned_header.nr_zones;
> +    bs->wps = s->wps;
> +    bs->bl.max_append_sectors = s->zoned_header.max_append_sectors;
> +    bs->bl.max_active_zones = s->zoned_header.max_active_zones;
> +    bs->bl.max_open_zones = s->zoned_header.max_open_zones;
> +    bs->bl.zone_size = s->zoned_header.zone_size;
> +    bs->bl.write_granularity = BDRV_SECTOR_SIZE;
>  }
>  
>  static int qcow2_reopen_prepare(BDRVReopenState *state,
> @@ -2672,9 +2850,26 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t 
> offset, int64_t bytes,
>      uint64_t host_offset;
>      QCowL2Meta *l2meta = NULL;
>      AioTaskPool *aio = NULL;
> +    int64_t start_offset, start_bytes;
> +    BlockZoneState zs;
> +    int64_t end;
> +    uint64_t *wp;
> +    int64_t zone_size = bs->bl.zone_size;
> +    int index;
>  
>      trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
>  
> +    start_offset = offset;
> +    start_bytes = bytes;
> +    /* The offset should not less than the wp of that
> +     * zone where offset starts.  */
> +    if (zone_size) {
> +        index = start_offset / zone_size;
> +        wp = &s->wps->wp[index];
> +        if (offset < qcow2_get_wp(*wp)) {
> +            return -EINVAL;
> +        }
> +    }
>      while (bytes != 0 && aio_task_pool_status(aio) == 0) {
>  
>          l2meta = NULL;
> @@ -2720,6 +2915,47 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t 
> offset, int64_t bytes,
>          qiov_offset += cur_bytes;
>          trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
>      }
> +
> +    if (zone_size) {
> +        index = start_offset / zone_size;
> +        wp = &s->wps->wp[index];
> +        uint64_t wpv = *wp;
> +        if (!QCOW2_ZT_IS_CONV(wpv)) {
> +            /*
> +             * Implicitly open one closed zone to write if there are zone 
> resources
> +             * left.
> +             */
> +            zs = qcow2_get_zs(wpv);
> +            if (zs == BLK_ZS_CLOSED || zs == BLK_ZS_EMPTY) {
> +                ret = qcow2_check_zone_resources(bs, zs);
> +                if (ret < 0) {
> +                    goto fail_nometa;
> +                }
> +
> +                if (zs == BLK_ZS_CLOSED) {
> +                    s->nr_zones_closed--;
> +                    s->nr_zones_imp_open++;
> +                } else {
> +                    s->nr_zones_imp_open++;
> +                }
> +            }
> +
> +            /* align up (start_offset, zone_size), the start offset is not
> +             * necessarily power of two. */
> +            end = ((start_offset + zone_size) / zone_size) * zone_size;
> +            if (start_offset + start_bytes <= end) {
> +                *wp = start_offset + start_bytes;
> +            } else {
> +                ret = -EINVAL;
> +                goto fail_nometa;
> +            }
> +
> +            ret = qcow2_write_wp_at(bs, wp, index,BLK_ZS_IOPEN);
> +            if (ret < 0) {
> +                goto fail_nometa;
> +            }
> +        }
> +    }
>      ret = 0;
>  
>      qemu_co_mutex_lock(&s->lock);
> @@ -3117,7 +3353,9 @@ int qcow2_update_header(BlockDriverState *bs)
>              .max_active_zones   =
>                  cpu_to_be32(s->zoned_header.max_active_zones),
>              .max_append_sectors =
> -                cpu_to_be32(s->zoned_header.max_append_sectors)
> +                cpu_to_be32(s->zoned_header.max_append_sectors),
> +            .zonedmeta_offset   = 
> cpu_to_be64(s->zoned_header.zonedmeta_offset),
> +            .zonedmeta_size     = cpu_to_be64(s->zoned_header.zonedmeta_size)
>          };
>          ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT,
>                               &zoned_header, sizeof(zoned_header),
> @@ -3522,7 +3760,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, 
> Error **errp)
>      int version;
>      int refcount_order;
>      uint64_t *refcount_table;
> -    int ret;
> +    uint64_t zoned_meta_size, zoned_clusterlen;
> +    int ret, offset, i;
>      uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
>  
>      assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
> @@ -3823,6 +4062,48 @@ qcow2_co_create(BlockdevCreateOptions *create_options, 
> Error **errp)
>          s->zoned_header.max_open_zones = qcow2_opts->max_open_zones;
>          s->zoned_header.max_active_zones = qcow2_opts->max_active_zones;
>          s->zoned_header.max_append_sectors = qcow2_opts->max_append_sectors;
> +        s->zoned_header.nr_zones = qcow2_opts->size / qcow2_opts->zone_size;
> +
> +        zoned_meta_size =  sizeof(uint64_t) * s->zoned_header.nr_zones;
> +        uint64_t meta[zoned_meta_size];

zoned_meta_size is in bytes but the array is in uint64_t. I guess the
array size should be s->zoned_header.nr_zones (in zones) instead of
zoned_meta_size (in bytes).

Please use g_autoptr and g_new() for this to avoid stack overflow issues
if nr_zones is large.

> +        memset(meta, 0, zoned_meta_size);

Unnecessary if you use g_new0(). Also, zeroing is probably unnecessary
since the for loops below fill in every element of the array.

> +
> +        for (i = 0; i < s->zoned_header.zone_nr_conv; ++i) {
> +            meta[i] = i * s->zoned_header.zone_size;
> +            meta[i] += 1ULL << 59;

Bitwise OR ('|') is clearer than addition. You do not rely on or want
the add operation's arithmetic carry here.

> +        }
> +        for (; i < s->zoned_header.nr_zones; ++i) {
> +            meta[i] = i * s->zoned_header.zone_size;
> +            /* For sequential zones, the first four most significant bit
> +             * indicates zone states. */
> +            meta[i] += ((uint64_t)BLK_ZS_EMPTY << 60);

Bitwise OR.

> +        }
> +
> +        offset = qcow2_alloc_clusters(blk_bs(blk), zoned_meta_size);
> +        if (offset < 0) {
> +            error_setg_errno(errp, -offset, "Could not allocate clusters "
> +                                            "for zoned metadata size");
> +            goto out;
> +        }
> +        s->zoned_header.zonedmeta_offset = offset;
> +        s->zoned_header.zonedmeta_size = zoned_meta_size;
> +
> +        zoned_clusterlen = size_to_clusters(s, zoned_meta_size)
> +                * s->cluster_size;
> +        assert(qcow2_pre_write_overlap_check(bs, 0, offset,
> +                                             zoned_clusterlen,false) == 0);
> +        ret = bdrv_pwrite_zeroes(blk_bs(blk)->file, offset,
> +                                 zoned_clusterlen, 0);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Could not zero fill zoned 
> metadata");
> +            goto out;
> +        }
> +        ret = bdrv_pwrite(blk_bs(blk)->file, offset, zoned_meta_size, meta, 
> 0);
> +        if (ret < 0) {
> +            error_setg_errno(errp, -ret, "Could not write zoned metadata "
> +                                         "to disk");
> +            goto out;
> +        }
>      }
>  
>      /* Create a full header (including things like feature table) */
> @@ -4166,6 +4447,346 @@ static coroutine_fn int 
> qcow2_co_pdiscard(BlockDriverState *bs,
>      return ret;
>  }
>  
> +static int coroutine_fn
> +qcow2_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                     unsigned int *nr_zones, BlockZoneDescriptor *zones)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    uint64_t zone_size = s->zoned_header.zone_size;
> +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    int64_t size = bs->bl.nr_zones * zone_size;
> +    int i = 0;
> +    int si;
> +
> +    if (zone_size > 0) {
> +        si = offset / zone_size;

offset must be validated. It might be beyond the capacity of the device.

> +        unsigned int nrz = *nr_zones;

nr_zones must be validated. It might be larger than bs->bl.nr_zones.

> +        qemu_co_mutex_lock(&s->wps->colock);
> +        for (; i < nrz; ++i) {
> +            zones[i].start = (si + i) * zone_size;
> +
> +            /* The last zone can be smaller than the zone size */
> +            if ((si + i + 1) == bs->bl.nr_zones && size > capacity) {
> +                zones[i].length = zone_size - (size - capacity);
> +            } else {
> +                zones[i].length = zone_size;
> +            }
> +            zones[i].cap = zone_size;

Should capacity also be capped for the last zone?

> +
> +            uint64_t wp = s->wps->wp[si + i];
> +            if (QCOW2_ZT_IS_CONV(wp)) {
> +                zones[i].type = BLK_ZT_CONV;
> +                zones[i].state = BLK_ZS_NOT_WP;
> +                /* Clear the zone type bit */
> +                wp &= ~(1ULL << 59);
> +            } else {
> +                zones[i].type = BLK_ZT_SWR;
> +                zones[i].state = qcow2_get_zs(wp);
> +                /* Clear the zone state bits */
> +                wp = qcow2_get_wp(wp);
> +            }
> +
> +            zones[i].wp = wp;
> +            if (si + i == bs->bl.nr_zones) {
> +                break;
> +            }

This check is too late because wp[] has already been indexed. It is
insufficient when the first zone is already out of bounds.

> +        }
> +        qemu_co_mutex_unlock(&s->wps->colock);
> +    }
> +    *nr_zones = i;
> +    return 0;
> +}
> +
> +static int qcow2_open_zone(BlockDriverState *bs, uint32_t index) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    qemu_co_mutex_lock(&s->wps->colock);
> +    uint64_t *wp = &s->wps->wp[index];
> +    BlockZoneState zs = qcow2_get_zs(*wp);
> +
> +    switch(zs) {
> +    case BLK_ZS_EMPTY:
> +        ret = qcow2_check_zone_resources(bs, BLK_ZS_EMPTY);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        break;
> +    case BLK_ZS_IOPEN:
> +        s->nr_zones_imp_open--;
> +        break;
> +    case BLK_ZS_EOPEN:
> +        return 0;
> +    case BLK_ZS_CLOSED:
> +        ret = qcow2_check_zone_resources(bs, BLK_ZS_CLOSED);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        s->nr_zones_closed--;
> +        break;
> +    case BLK_ZS_FULL:
> +        break;
> +    default:
> +        return -EINVAL;
> +    }

s->wps->colock is not unlocked in the return code paths above.

> +    ret = qcow2_write_wp_at(bs, wp, index, BLK_ZS_EOPEN);

I wanted to confirm with you and Damien that zone states are persisted
in the zoned storage model, even OPEN/CLOSED? To me, OPEN/CLOSED, seem
more related to runtime resource limits than persistent state that needs
to be stored.

> +    if (!ret) {
> +        s->nr_zones_exp_open++;
> +    }
> +    qemu_co_mutex_unlock(&s->wps->colock);
> +    return ret;
> +}
> +
> +static int qcow2_close_zone(BlockDriverState *bs, uint32_t index) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    qemu_co_mutex_lock(&s->wps->colock);
> +    uint64_t *wp = &s->wps->wp[index];
> +    BlockZoneState zs = qcow2_get_zs(*wp);
> +
> +    switch(zs) {
> +    case BLK_ZS_EMPTY:
> +        break;
> +    case BLK_ZS_IOPEN:
> +        s->nr_zones_imp_open--;
> +        break;
> +    case BLK_ZS_EOPEN:
> +        s->nr_zones_exp_open--;
> +        break;
> +    case BLK_ZS_CLOSED:
> +        ret = qcow2_check_zone_resources(bs, BLK_ZS_CLOSED);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        s->nr_zones_closed--;
> +        break;
> +    case BLK_ZS_FULL:
> +        break;
> +    default:
> +        return -EINVAL;
> +    }

s->wps->colock is not unlocked in the return code paths above.

> +
> +    if (zs == BLK_ZS_EMPTY) {
> +        ret = qcow2_write_wp_at(bs, wp, index, BLK_ZS_EMPTY);
> +    } else {
> +        ret = qcow2_write_wp_at(bs, wp, index, BLK_ZS_CLOSED);
> +        if (!ret) {
> +            s->nr_zones_closed++;
> +        }
> +    }
> +    qemu_co_mutex_unlock(&s->wps->colock);
> +    return ret;
> +}
> +
> +static int qcow2_finish_zone(BlockDriverState *bs, uint32_t index) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +
> +    qemu_co_mutex_lock(&s->wps->colock);
> +    uint64_t *wp = &s->wps->wp[index];
> +    BlockZoneState zs = qcow2_get_zs(*wp);
> +
> +    switch(zs) {
> +    case BLK_ZS_EMPTY:
> +        ret = qcow2_check_zone_resources(bs, BLK_ZS_EMPTY);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        break;
> +    case BLK_ZS_IOPEN:
> +        s->nr_zones_imp_open--;
> +        break;
> +    case BLK_ZS_EOPEN:
> +        s->nr_zones_exp_open--;
> +        break;
> +    case BLK_ZS_CLOSED:
> +        ret = qcow2_check_zone_resources(bs, BLK_ZS_CLOSED);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        s->nr_zones_closed--;
> +        break;
> +    case BLK_ZS_FULL:
> +        return 0;
> +    default:
> +        return -EINVAL;
> +    }

s->wps->colock is not unlocked in the return code paths above.

> +
> +    *wp = (index + 1) * s->zoned_header.zone_size;

There is an integer overflow here. Please see my comment in
qcow2_reset_zone() below.

> +    ret = qcow2_write_wp_at(bs, wp, index, BLK_ZS_FULL);
> +    qemu_co_mutex_unlock(&s->wps->colock);
> +    return ret;
> +}
> +
> +static int qcow2_reset_zone(BlockDriverState *bs, uint32_t index,
> +                            int64_t len) {
> +    BDRVQcow2State *s = bs->opaque;
> +    int nrz = bs->bl.nr_zones;
> +    int zone_size = bs->bl.zone_size;
> +    int n, ret = 0;
> +
> +    qemu_co_mutex_lock(&s->wps->colock);
> +    uint64_t *wp = &s->wps->wp[index];
> +    if (len == bs->total_sectors << BDRV_SECTOR_BITS) {
> +        n = nrz;
> +        index = 0;
> +    } else {
> +        n = len / zone_size;
> +    }
> +
> +    for (int i = 0; i < n; ++i) {
> +        uint64_t *wp_i = (uint64_t *)(wp + i);
> +        uint64_t wpi_v = *wp_i;
> +        if (QCOW2_ZT_IS_CONV(wpi_v)) {
> +            continue;
> +        }
> +        
> +        BlockZoneState zs = qcow2_get_zs(wpi_v);
> +        switch (zs) {
> +        case BLK_ZS_EMPTY:
> +            break;
> +        case BLK_ZS_IOPEN:
> +            s->nr_zones_imp_open--;
> +            break;
> +        case BLK_ZS_EOPEN:
> +            s->nr_zones_exp_open--;
> +            break;
> +        case BLK_ZS_CLOSED:
> +            s->nr_zones_closed--;
> +            break;
> +        case BLK_ZS_FULL:
> +            break;
> +        default:
> +            return -EINVAL;
> +        }
> +
> +        if (zs == BLK_ZS_EMPTY) {
> +            continue;
> +        }
> +
> +        *wp_i = (index + i) * zone_size;

This calculation needs uint64_t to avoid overflowing int. The types
involved are:

  uint64_t = (uint32_t + int) * int;

You can fix it using:

  *wp_i = ((uint64_t)index + i) * zone_size;

Then the entire expression will be evaluated as a 64-bit integer instead
of a 32-bit integer.

> +        ret = qcow2_write_wp_at(bs, wp_i, index + i, BLK_ZS_EMPTY);
> +        if (ret < 0) {
> +            return ret;

s->wps->colock must be unlocked.

> +        }
> +        /* clear data */
> +        ret = qcow2_co_pwrite_zeroes(bs, qcow2_get_wp(*wp_i), zone_size, 0);

Does zone reset guarantee that the data blocks will be zeroed according
to the zoned storage model?

> +        if (ret < 0) {
> +            error_report("Failed to reset zone at 0x%" PRIx64 "", *wp_i);
> +        }
> +    }
> +    qemu_co_mutex_unlock(&s->wps->colock);
> +    return ret;
> +}
> +
> +static int coroutine_fn qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp 
> op,
> +                                           int64_t offset, int64_t len)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret = 0;
> +    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    int64_t zone_size = s->zoned_header.zone_size;
> +    int64_t zone_size_mask = zone_size - 1;
> +    uint32_t index = offset / zone_size;
> +    BlockZoneWps *wps = s->wps;
> +
> +    if (offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size"
> +                     " %" PRId64 "", offset / 512, zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    if (((offset + len) < capacity && len & zone_size_mask) ||
> +        offset + len > capacity) {
> +        error_report("number of sectors %" PRId64 " is not aligned to zone"
> +                     " size %" PRId64 "", len / 512, zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    qemu_co_mutex_lock(&wps->colock);
> +    uint64_t wpv = wps->wp[offset / zone_size];

Use index here instead of recalculating it?

> +    if (QCOW2_ZT_IS_CONV(wpv) && len != capacity) {
> +        error_report("zone mgmt operations are not allowed for "
> +                     "conventional zones");
> +        ret = -EIO;
> +        goto unlock;
> +    }
> +    qemu_co_mutex_unlock(&wps->colock);
> +
> +    switch(op) {
> +    case BLK_ZO_OPEN:
> +        ret = qcow2_open_zone(bs, index);
> +        break;
> +    case BLK_ZO_CLOSE:
> +        ret = qcow2_close_zone(bs, index);
> +        break;
> +    case BLK_ZO_FINISH:
> +        ret = qcow2_finish_zone(bs, index);
> +        break;
> +    case BLK_ZO_RESET:
> +        ret = qcow2_reset_zone(bs, index, len);
> +        break;
> +    default:
> +        error_report("Unsupported zone op: 0x%x", op);
> +        ret = -ENOTSUP;
> +        break;
> +    }
> +    return ret;
> +
> +unlock:
> +    qemu_co_mutex_unlock(&wps->colock);
> +    return ret;
> +}
> +
> +static int coroutine_fn
> +qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector 
> *qiov,
> +                     BdrvRequestFlags flags)
> +{
> +    assert(flags == 0);
> +    BDRVQcow2State *s = bs->opaque;
> +    int ret;
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +
> +    /* offset + len should not pass the end of that zone starting from 
> offset */
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +        iov_len = qiov->iov[i].iov_len;
> +        if (iov_len & wg_mask) {
> +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> +                         "block size %" PRId64 "", i, iov_len, wg);
> +            return -EINVAL;
> +        }
> +    }
> +    len = qiov->size;
> +
> +    if ((len >> BDRV_SECTOR_BITS) > bs->bl.max_append_sectors) {
> +        return -ENOTSUP;
> +    }
> +
> +    qemu_co_mutex_lock(&s->wps->colock);
> +    uint64_t wp = s->wps->wp[*offset / bs->bl.zone_size];

Where is *offset checked against nr_zones to prevent an access beyond
the end of the array?

> +    uint64_t wp_i = qcow2_get_wp(wp);
> +    ret = qcow2_co_pwritev_part(bs, wp_i, len, qiov, 0, 0);
> +    if (ret == 0) {
> +        *offset = wp_i;
> +    } else {
> +        error_report("qcow2: zap failed");
> +    }
> +
> +    qemu_co_mutex_unlock(&s->wps->colock);
> +    return ret;
> +}
> +
>  static int coroutine_fn GRAPH_RDLOCK
>  qcow2_co_copy_range_from(BlockDriverState *bs,
>                           BdrvChild *src, int64_t src_offset,
> @@ -6214,6 +6835,10 @@ BlockDriver bdrv_qcow2 = {
>      .bdrv_co_pwritev_part   = qcow2_co_pwritev_part,
>      .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
>  
> +    .bdrv_co_zone_report    = qcow2_co_zone_report,
> +    .bdrv_co_zone_mgmt    = qcow2_co_zone_mgmt,
> +    .bdrv_co_zone_append    = qcow2_co_zone_append,
> +
>      .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
>      .bdrv_co_pdiscard       = qcow2_co_pdiscard,
>      .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
> diff --git a/block/qcow2.h b/block/qcow2.h
> index fe18dc4d97..a3a96ddbce 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -246,6 +246,8 @@ typedef struct Qcow2ZonedHeaderExtension {
>      uint32_t max_active_zones;
>      uint32_t max_open_zones;
>      uint32_t max_append_sectors;
> +    uint64_t zonedmeta_offset;
> +    uint64_t zonedmeta_size;
>      uint8_t padding[3];
>  } QEMU_PACKED Qcow2ZonedHeaderExtension;
>  
> -- 
> 2.40.1
> 

Attachment: signature.asc
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]