block: introduce zone_write_granularity limit

Per ZBC and ZAC specifications, host-managed SMR hard-disks mandate that all writes into sequential write required zones be aligned to the device physical block size. However, NVMe ZNS does not have this constraint and allows write operations into sequential zones to be aligned to the device logical block size. This inconsistency does not help with software portability across device types. To solve this, introduce the zone_write_granularity queue limit to indicate the alignment constraint, in bytes, of write operations into zones of a zoned block device. This new limit is exported as a read-only sysfs queue attribute and the helper blk_queue_zone_write_granularity() introduced for drivers to set this limit. The function blk_queue_set_zoned() is modified to set this new limit to the device logical block size by default. NVMe ZNS devices as well as zoned nullb devices use this default value as is. The scsi disk driver is modified to execute the blk_queue_zone_write_granularity() helper to set the zone write granularity of host-managed SMR disks to the disk physical block size. The accessor functions queue_zone_write_granularity() and bdev_zone_write_granularity() are also introduced. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@edc.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-07-23 04:33:26 +02:00 · 2021-01-28 13:47:30 +09:00
parent eafc63a9f7
commit a805a4fa4f
5 changed files with 74 additions and 1 deletions
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
+	lim->zone_write_granularity = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);

@@ -366,6 +367,28 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
 }
 EXPORT_SYMBOL(blk_queue_physical_block_size);

+/**
+ * blk_queue_zone_write_granularity - set zone write granularity for the queue
+ * @q:  the request queue for the zoned device
+ * @size:  the zone write granularity size, in bytes
+ *
+ * Description:
+ *   This should be set to the lowest possible size allowing to write in
+ *   sequential zones of a zoned block device.
+ */
+void blk_queue_zone_write_granularity(struct request_queue *q,
+				      unsigned int size)
+{
+	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+		return;
+
+	q->limits.zone_write_granularity = size;
+
+	if (q->limits.zone_write_granularity < q->limits.logical_block_size)
+		q->limits.zone_write_granularity = q->limits.logical_block_size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
+
 /**
 * blk_queue_alignment_offset - set physical block alignment offset
 * @q:	the request queue for the device
@@ -631,6 +654,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			t->discard_granularity;
 	}

+	t->zone_write_granularity = max(t->zone_write_granularity,
+					b->zone_write_granularity);
 	t->zoned = max(t->zoned, b->zoned);
 	return ret;
 }
@@ -847,6 +872,8 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
 */
 void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 {
+	struct request_queue *q = disk->queue;
+
 	switch (model) {
 	case BLK_ZONED_HM:
 		/*
@@ -875,7 +902,15 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 		break;
 	}

-	disk->queue->limits.zoned = model;
+	q->limits.zoned = model;
+	if (model != BLK_ZONED_NONE) {
+		/*
+		 * Set the zone write granularity to the device logical block
+		 * size by default. The driver can change this value if needed.
+		 */
+		blk_queue_zone_write_granularity(q,
+						queue_logical_block_size(q));
+	}
 }
 EXPORT_SYMBOL_GPL(blk_queue_set_zoned);

--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -219,6 +219,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
 		(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
 }

+static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
+						 char *page)
+{
+	return queue_var_show(queue_zone_write_granularity(q), page);
+}
+
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
 	unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -585,6 +591,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");

 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -639,6 +646,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
+	&queue_zone_write_granularity_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
 	&queue_nr_zones_entry.attr,