block: Export I/O topology for block devices and partitions To support devices with physical block sizes bigger than 512 bytes we need to ensure proper alignment. This patch adds support for exposing I/O topology characteristics as devices are stacked. hardsect_size remains unchanged. It is the smallest atomic unit the device can address (i.e. logical block size). io_granularity indicates the smallest I/O the device can access without incurring a read-modify-write penalty. The granularity is set by low-level drivers; from then on it is purely internal to the stacking logic. The io_min parameter is the smallest preferred I/O size reported by the device. In many cases this is the same as granularity. However, the io_min parameter can be scaled up when stacking (RAID5 chunk size > physical sector size). io_min is available in sysfs (minimum_io_size). The io_opt characteristic indicates the optimal I/O size reported by the device. This is usually the stripe width for arrays. The value is in sysfs (optimal_io_size). The io_alignment parameter indicates the number of bytes the start of the device/partition is offset from the device granularity. Partition tools and MD/DM tools can use this to align filesystems to the proper boundaries. Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> --- diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -60,3 +60,44 @@ Description: Indicates whether the block layer should automatically generate checksums for write requests bound for devices that support receiving integrity metadata. + +What: /sys/block/<disk>/alignment +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). 
This parameter + indicates how many bytes the beginning of the device are + offset from the disk's natural alignment. + +What: /sys/block/<disk>/<partition>/alignment +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). This parameter + indicates how many bytes the beginning of the partition + are offset from the disk's natural alignment. + +What: /sys/block/<disk>/queue/minimum_io_size +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report a preferred minimum I/O size, + which is the smallest request the device can perform + without incurring a read-modify-write penalty. For disk + drives this is often the physical block size. For RAID + arrays it is often the stripe chunk size. + +What: /sys/block/<disk>/queue/optimal_io_size +Date: April 2009 +Contact: Martin K. Petersen <martin.petersen@oracle.com> +Description: + Storage devices may report an optimal I/O size, which is + the device's preferred unit of receiving I/O. This is + rarely reported for disk drives. For RAID devices it is + usually the stripe width or the internal block size. diff --git a/block/blk-settings.c b/block/blk-settings.c --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -301,16 +301,99 @@ EXPORT_SYMBOL(blk_queue_max_segment_size * * Description: * This should typically be set to the lowest possible sector size - * that the hardware can operate on (possible without reverting to - * even internal read-modify-write operations). Usually the default - * of 512 covers most hardware. - **/ + * (logical block size) that the hardware can operate on. Usually the + * default of 512 covers most hardware. 
+ */ void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) { q->limits.hardsect_size = size; + + if (q->limits.io_granularity < size) + q->limits.io_granularity = size; + + if (q->limits.io_min < q->limits.io_granularity) + q->limits.io_min = q->limits.io_granularity; } EXPORT_SYMBOL(blk_queue_hardsect_size); +/** + * blk_queue_io_granularity - set I/O granularity for the queue + * @q: the request queue for the device + * @size: the I/O granularity, in bytes + * + * Description: + * This should typically be set to the lowest possible sector size + * that the hardware can operate on without reverting to + * read-modify-write operations. + */ +void blk_queue_io_granularity(struct request_queue *q, unsigned short size) +{ + q->limits.io_granularity = size; + + if (q->limits.io_granularity < q->limits.hardsect_size) + q->limits.io_granularity = q->limits.hardsect_size; + + if (q->limits.io_min < q->limits.io_granularity) + q->limits.io_min = q->limits.io_granularity; +} +EXPORT_SYMBOL(blk_queue_io_granularity); + +/** + * blk_queue_io_alignment - set physical block alignment for the queue + * @q: the request queue for the device + * @alignment: alignment offset in bytes + * + * Description: + * Some devices are naturally misaligned to compensate for things like + * the legacy DOS partition table 63-sector offset. Low-level drivers + * should call this function for devices whose first sector is not + * naturally aligned. + */ +void blk_queue_io_alignment(struct request_queue *q, unsigned int alignment) +{ + q->limits.io_alignment = alignment & (q->limits.io_granularity - 1); + q->limits.misaligned = 0; +} +EXPORT_SYMBOL(blk_queue_io_alignment); + +/** + * blk_queue_io_min - set minimum request size for the queue + * @q: the request queue for the device + * @min: smallest I/O size in bytes + * + * Description: + * Some devices have an internal block size bigger than the reported + * hardware sector size. 
This function can be used to signal the + smallest I/O the device can perform without incurring a performance + penalty. + */ +void blk_queue_io_min(struct request_queue *q, unsigned int min) +{ + q->limits.io_min = min; + + if (q->limits.io_min < q->limits.hardsect_size) + q->limits.io_min = q->limits.hardsect_size; + + if (q->limits.io_min < q->limits.io_granularity) + q->limits.io_min = q->limits.io_granularity; +} +EXPORT_SYMBOL(blk_queue_io_min); + +/** + * blk_queue_io_opt - set optimal request size for the queue + * @q: the request queue for the device + * @opt: optimal request size in bytes + * + * Description: + * Drivers can call this function to set the preferred I/O request + * size for devices that report such a value. + */ +void blk_queue_io_opt(struct request_queue *q, unsigned int opt) +{ + q->limits.io_opt = opt; +} +EXPORT_SYMBOL(blk_queue_io_opt); + /* * Returns the minimum that is _not_ zero, unless both are zero. */ @@ -357,6 +440,102 @@ void blk_queue_stack_limits(struct reque EXPORT_SYMBOL(blk_queue_stack_limits); /** + * blk_stack_limits - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top) + * @b: the underlying queue limits (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges two queue_limit structs. Returns 0 if alignment didn't + * change. Returns -1 if adding the bottom device caused + * misalignment. 
+ */ +int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t offset) +{ + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + + t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, + b->seg_boundary_mask); + + t->max_phys_segments = min_not_zero(t->max_phys_segments, + b->max_phys_segments); + + t->max_hw_segments = min_not_zero(t->max_hw_segments, + b->max_hw_segments); + + t->max_segment_size = min_not_zero(t->max_segment_size, + b->max_segment_size); + + t->hardsect_size = max(t->hardsect_size, b->hardsect_size); + t->io_min = max(t->io_min, b->io_min); + t->io_granularity = max(t->io_granularity, b->io_granularity); + + t->no_cluster |= b->no_cluster; + + /* Bottom device offset aligned? */ + if (offset && (offset & (b->io_granularity - 1)) != b->io_alignment) { + t->misaligned = 1; + return -1; + } + + /* If top has no alignment, inherit from bottom */ + if (!t->io_alignment) + t->io_alignment = b->io_alignment & (b->io_granularity - 1); + + /* Top alignment on logical block boundary? */ + if (t->io_alignment & (t->hardsect_size - 1)) { + t->misaligned = 1; + return -1; + } + + return 0; +} + +/** + * disk_stack_limits - adjust queue limits for stacked drivers + * @disk: MD/DM gendisk (top) + * @bdev: the underlying block device (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges the limits for two queues. A warning is printed if the + * bottom device is misaligned or if stacking caused misalignment. 
+ */ +void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset) +{ + struct request_queue *t = disk->queue; + struct request_queue *b = bdev_get_queue(bdev); + + offset += get_start_sect(bdev) << 9; + + if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; + + disk_name(disk, 0, top); + bdevname(bdev, bottom); + + printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", + top, bottom); + } + + if (!t->queue_lock) + WARN_ON_ONCE(1); + else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { + unsigned long flags; + + spin_lock_irqsave(t->queue_lock, flags); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + queue_flag_clear(QUEUE_FLAG_CLUSTER, t); + spin_unlock_irqrestore(t->queue_lock, flags); + } +} +EXPORT_SYMBOL(disk_stack_limits); + +/** * blk_queue_dma_pad - set pad mask * @q: the request queue for the device * @mask: pad mask diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -105,6 +105,16 @@ static ssize_t queue_hw_sector_size_show return queue_var_show(queue_hardsect_size(q), page); } +static ssize_t queue_io_min_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_min(q), page); +} + +static ssize_t queue_io_opt_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_opt(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -256,6 +266,16 @@ static struct queue_sysfs_entry queue_hw .show = queue_hw_sector_size_show, }; +static struct queue_sysfs_entry queue_io_min_entry = { + .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, + .show = queue_io_min_show, +}; + +static struct queue_sysfs_entry queue_io_opt_entry = { + .attr = {.name = "optimal_io_size", .mode = S_IRUGO }, + .show = queue_io_opt_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode 
= S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -287,6 +307,8 @@ static struct attribute *default_attrs[] &queue_max_sectors_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, + &queue_io_min_entry.attr, + &queue_io_opt_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/genhd.c b/block/genhd.c --- a/block/genhd.c +++ b/block/genhd.c @@ -848,11 +848,20 @@ static ssize_t disk_capability_show(stru return sprintf(buf, "%x\n", disk->flags); } +static ssize_t disk_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_io_alignment(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment, S_IRUGO, disk_alignment_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -871,6 +880,7 @@ static struct attribute *disk_attrs[] = &dev_attr_removable.attr, &dev_attr_ro.attr, &dev_attr_size.attr, + &dev_attr_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/fs/partitions/check.c b/fs/partitions/check.c --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *de return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment); +} + ssize_t part_stat_show(struct 
device *dev, struct device_attribute *attr, char *buf) { @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *d static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment, S_IRUGO, part_alignment_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_alignment.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct g pdev = part_to_dev(p); p->start_sect = start; + p->alignment = queue_sector_alignment(disk->queue, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -324,10 +324,16 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_segment_size; + unsigned int io_alignment; + unsigned int io_granularity; + unsigned int io_min; + unsigned int io_opt; + unsigned short hardsect_size; unsigned short max_hw_segments; unsigned short max_phys_segments; + unsigned char misaligned; unsigned char no_cluster; }; @@ -886,7 +892,16 @@ extern void blk_queue_max_phys_segments( extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); +extern void blk_queue_io_granularity(struct request_queue *, unsigned short); +extern void blk_queue_io_alignment(struct request_queue *q, + unsigned int alignment); +extern void blk_queue_io_min(struct request_queue *q, unsigned int min); +extern 
void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); +extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t offset); +extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, @@ -1022,6 +1037,39 @@ static inline unsigned short bdev_hardse return queue_hardsect_size(bdev_get_queue(bdev)); } +static inline unsigned int queue_io_min(struct request_queue *q) +{ + return q->limits.io_min; +} + +static inline unsigned int queue_io_opt(struct request_queue *q) +{ + return q->limits.io_opt; +} + +static inline unsigned int queue_io_granularity(struct request_queue *q) +{ + return q->limits.io_granularity; +} + +static inline int queue_io_alignment(struct request_queue *q) +{ + if (q && q->limits.misaligned) + return -1; + + if (q && q->limits.io_alignment) + return q->limits.io_alignment; + + return 0; +} + +static inline int queue_sector_alignment(struct request_queue *q, + sector_t sector) +{ + return ((sector << 9) - q->limits.io_alignment) + & (q->limits.io_min - 1); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/genhd.h b/include/linux/genhd.h --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -90,6 +90,7 @@ struct disk_stats { struct hd_struct { sector_t start_sect; sector_t nr_sects; + sector_t alignment; struct device __dev; struct kobject *holder_dir; int policy, partno;