@@ -71,9 +71,8 @@
 */
static int max_queued_requests = 1024;

static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
                          sector_t bi_sector);
static void lower_barrier(struct r1conf *conf);
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);

#define raid1_log(md, fmt, args...) \
        do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
@@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)

static void *r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio)
static void put_buf(struct r1bio *r1_bio)
{
        struct r1conf *conf = r1_bio->mddev->private;
        sector_t sect = r1_bio->sector;
        int i;

        for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio)
        mempool_free(r1_bio, conf->r1buf_pool);

        lower_barrier(conf);
        lower_barrier(conf, sect);
}

static void reschedule_retry(struct r1bio *r1_bio)
@@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio)
        unsigned long flags;
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
        int idx;

        idx = sector_to_idx(r1_bio->sector);
        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r1_bio->retry_list, &conf->retry_list);
        conf->nr_queued++;
        atomic_inc(&conf->nr_queued[idx]);
        spin_unlock_irqrestore(&conf->device_lock, flags);

        wake_up(&conf->wait_barrier);
@@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
        struct bio *bio = r1_bio->master_bio;
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
        sector_t start_next_window = r1_bio->start_next_window;
        sector_t bi_sector = bio->bi_iter.bi_sector;

        if (bio->bi_phys_segments) {
@@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 * Wake up any possible resync thread that waits for the device
                 * to go idle.
                 */
                allow_barrier(conf, start_next_window, bi_sector);
                allow_barrier(conf, bi_sector);
        }
}
@@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio)
        bio_put(to_put);
}
static sector_t align_to_barrier_unit_end(sector_t start_sector,
                                          sector_t sectors)
{
        sector_t len;

        WARN_ON(sectors == 0);
        /*
         * len is the number of sectors from start_sector to end of the
         * barrier unit which start_sector belongs to.
         */
        len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
              start_sector;

        if (len > sectors)
                len = sectors;

        return len;
}
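
/*
 * align_to_barrier_unit_end() and sector_to_idx() (used above) depend on the
 * barrier-unit and bucket definitions introduced elsewhere in this patch. A
 * minimal sketch consistent with the code here, assuming 64 MB barrier units
 * hashed into a page-sized array of atomic_t counters and <linux/hash.h> for
 * hash_long(), would be:
 */
#define BARRIER_UNIT_SECTOR_BITS        17
#define BARRIER_UNIT_SECTOR_SIZE        (1 << BARRIER_UNIT_SECTOR_BITS)
#define BARRIER_BUCKETS_NR_BITS         (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
#define BARRIER_BUCKETS_NR              (1 << BARRIER_BUCKETS_NR_BITS)

/* map a sector to the index of the barrier bucket its barrier unit hashes to */
static inline int sector_to_idx(sector_t sector)
{
        return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
                         BARRIER_BUCKETS_NR_BITS);
}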
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
@@ -813,166 +832,226 @@ static void flush_pending_writes(struct r1conf *conf)
 */
static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
        int idx = sector_to_idx(sector_nr);

        spin_lock_irq(&conf->resync_lock);

        /* Wait until no block IO is waiting */
        wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
        wait_event_lock_irq(conf->wait_barrier,
                            !atomic_read(&conf->nr_waiting[idx]),
                            conf->resync_lock);

        /* block any new IO from starting */
        conf->barrier++;
        conf->next_resync = sector_nr;
        atomic_inc(&conf->barrier[idx]);
        /*
         * In raise_barrier() we firstly increase conf->barrier[idx], then
         * check conf->nr_pending[idx]. In _wait_barrier() we firstly
         * increase conf->nr_pending[idx], then check conf->barrier[idx].
         * A memory barrier is needed here to make sure conf->nr_pending[idx]
         * won't be fetched before conf->barrier[idx] is increased. Otherwise
         * there will be a race between raise_barrier() and _wait_barrier().
         */
        smp_mb__after_atomic();

        /* For these conditions we must wait:
         * A: while the array is in frozen state
         * B: while barrier >= RESYNC_DEPTH, meaning resync reaches
         *    the max count which is allowed.
         * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
         *    next resync will reach the window which normal bios are
         *    handling.
         * D: while there are any active requests in the current window.
         * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
         *    exists in the corresponding I/O barrier bucket.
         * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning it reaches
         *    the max resync count allowed on the current I/O barrier bucket.
         */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->array_frozen &&
                            conf->barrier < RESYNC_DEPTH &&
                            conf->current_window_requests == 0 &&
                            (conf->start_next_window >=
                             conf->next_resync + RESYNC_SECTORS),
                            !atomic_read(&conf->nr_pending[idx]) &&
                            atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
                            conf->resync_lock);

        conf->nr_pending++;
        atomic_inc(&conf->nr_pending[idx]);
        spin_unlock_irq(&conf->resync_lock);
}
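
/*
 * The ordering that the smp_mb__after_atomic() above (and its counterpart in
 * _wait_barrier() below) relies on, shown schematically as a sketch rather
 * than code from this patch:
 *
 *      raise_barrier()                         _wait_barrier()
 *      --------------------------------        -----------------------------------
 *      atomic_inc(&conf->barrier[idx]);        atomic_inc(&conf->nr_pending[idx]);
 *      smp_mb__after_atomic();                 smp_mb__after_atomic();
 *      read conf->nr_pending[idx];             read conf->barrier[idx];
 *
 * With both barriers in place, at least one side must observe the other
 * side's increment, so resync and regular I/O on the same bucket can never
 * both proceed past their checks at the same time.
 */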
static void lower_barrier(struct r1conf *conf)
static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
{
        unsigned long flags;
        BUG_ON(conf->barrier <= 0);
        spin_lock_irqsave(&conf->resync_lock, flags);
        conf->barrier--;
        conf->nr_pending--;
        spin_unlock_irqrestore(&conf->resync_lock, flags);
        int idx = sector_to_idx(sector_nr);

        BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
        atomic_dec(&conf->barrier[idx]);
        atomic_dec(&conf->nr_pending[idx]);
        wake_up(&conf->wait_barrier);
}
static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
static void _wait_barrier(struct r1conf *conf, int idx)
{
        bool wait = false;

        /*
         * We need to increase conf->nr_pending[idx] very early here,
         * then raise_barrier() can be blocked when it waits for
         * conf->nr_pending[idx] to be 0. Then we can avoid holding
         * conf->resync_lock when there is no barrier raised in the same
         * barrier unit bucket. Also if the array is frozen, I/O
         * should be blocked until the array is unfrozen.
         */
        atomic_inc(&conf->nr_pending[idx]);
        /*
         * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
         * check conf->barrier[idx]. In raise_barrier() we firstly increase
         * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
         * barrier is necessary here to make sure conf->barrier[idx] won't be
         * fetched before conf->nr_pending[idx] is increased. Otherwise there
         * will be a race between _wait_barrier() and raise_barrier().
         */
        smp_mb__after_atomic();

        if (conf->array_frozen || !bio)
                wait = true;
        else if (conf->barrier && bio_data_dir(bio) == WRITE) {
                if ((conf->mddev->curr_resync_completed
                     >= bio_end_sector(bio)) ||
                    (conf->start_next_window + NEXT_NORMALIO_DISTANCE
                     <= bio->bi_iter.bi_sector))
                        wait = false;
                else
                        wait = true;
        }

        /*
         * Don't worry about checking two atomic_t variables at the same time
         * here. If, while we check conf->barrier[idx], the array is
         * frozen (conf->array_frozen is 1) and conf->barrier[idx] is
         * 0, it is safe to return and let the I/O continue. Because the
         * array is frozen, all I/O returned here will eventually complete
         * or be queued, so no race will happen. See the code comment in
         * freeze_array().
         */
        if (!READ_ONCE(conf->array_frozen) &&
            !atomic_read(&conf->barrier[idx]))
                return;

        return wait;
        /*
         * After holding conf->resync_lock, conf->nr_pending[idx]
         * should be decreased before waiting for the barrier to drop.
         * Otherwise, we may encounter a race condition because
         * raise_barrier() might be waiting for conf->nr_pending[idx]
         * to be 0 at the same time.
         */
        spin_lock_irq(&conf->resync_lock);
        atomic_inc(&conf->nr_waiting[idx]);
        atomic_dec(&conf->nr_pending[idx]);
        /*
         * In case freeze_array() is waiting for
         * get_unqueued_pending() == extra
         */
        wake_up(&conf->wait_barrier);
        /* Wait for the barrier in the same barrier unit bucket to drop. */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->array_frozen &&
                            !atomic_read(&conf->barrier[idx]),
                            conf->resync_lock);
        atomic_inc(&conf->nr_pending[idx]);
        atomic_dec(&conf->nr_waiting[idx]);
        spin_unlock_irq(&conf->resync_lock);
}
static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
{
        sector_t sector = 0;
        int idx = sector_to_idx(sector_nr);

        /*
         * Very similar to _wait_barrier(). The difference is, for read
         * I/O we don't need to wait for sync I/O, but if the whole array
         * is frozen, the read I/O still has to wait until the array is
         * unfrozen. Since there is no ordering requirement with
         * conf->barrier[idx] here, a memory barrier is unnecessary as well.
         */
        atomic_inc(&conf->nr_pending[idx]);

        if (!READ_ONCE(conf->array_frozen))
                return;

        spin_lock_irq(&conf->resync_lock);
        if (need_to_wait_for_sync(conf, bio)) {
                conf->nr_waiting++;
                /* Wait for the barrier to drop.
                 * However if there are already pending
                 * requests (preventing the barrier from
                 * rising completely), and the
                 * per-process bio queue isn't empty,
                 * then don't wait, as we need to empty
                 * that queue to allow conf->start_next_window
                 * to increase.
                 */
                raid1_log(conf->mddev, "wait barrier");
        atomic_inc(&conf->nr_waiting[idx]);
        atomic_dec(&conf->nr_pending[idx]);
        /*
         * In case freeze_array() is waiting for
         * get_unqueued_pending() == extra
         */
        wake_up(&conf->wait_barrier);
        /* Wait for array to be unfrozen */
                wait_event_lock_irq(conf->wait_barrier,
                                    !conf->array_frozen &&
                                    (!conf->barrier ||
                                     ((conf->start_next_window <
                                       conf->next_resync + RESYNC_SECTORS) &&
                                      current->bio_list &&
                                      !bio_list_empty(current->bio_list))),
                            !conf->array_frozen,
                            conf->resync_lock);
                conf->nr_waiting--;
        }

        if (bio && bio_data_dir(bio) == WRITE) {
                if (bio->bi_iter.bi_sector >= conf->next_resync) {
                        if (conf->start_next_window == MaxSector)
                                conf->start_next_window =
                                        conf->next_resync +
                                        NEXT_NORMALIO_DISTANCE;
        atomic_inc(&conf->nr_pending[idx]);
        atomic_dec(&conf->nr_waiting[idx]);
        spin_unlock_irq(&conf->resync_lock);
}
                        if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
                            <= bio->bi_iter.bi_sector)
                                conf->next_window_requests++;
                        else
                                conf->current_window_requests++;
                        sector = conf->start_next_window;
                }
        }
static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
{
        int idx = sector_to_idx(sector_nr);

        conf->nr_pending++;
        spin_unlock_irq(&conf->resync_lock);
        return sector;
        _wait_barrier(conf, idx);
}

static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
                          sector_t bi_sector)
static void wait_all_barriers(struct r1conf *conf)
{
        unsigned long flags;
        int idx;

        spin_lock_irqsave(&conf->resync_lock, flags);
        conf->nr_pending--;
        if (start_next_window) {
                if (start_next_window == conf->start_next_window) {
                        if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
                            <= bi_sector)
                                conf->next_window_requests--;
                        else
                                conf->current_window_requests--;
                } else
                        conf->current_window_requests--;

                if (!conf->current_window_requests) {
                        if (conf->next_window_requests) {
                                conf->current_window_requests =
                                        conf->next_window_requests;
                                conf->next_window_requests = 0;
                                conf->start_next_window +=
                                        NEXT_NORMALIO_DISTANCE;
                        } else
                                conf->start_next_window = MaxSector;
                }
        }
        spin_unlock_irqrestore(&conf->resync_lock, flags);
        for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
                _wait_barrier(conf, idx);
}

static void _allow_barrier(struct r1conf *conf, int idx)
{
        atomic_dec(&conf->nr_pending[idx]);
        wake_up(&conf->wait_barrier);
}

static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
{
        int idx = sector_to_idx(sector_nr);

        _allow_barrier(conf, idx);
}

static void allow_all_barriers(struct r1conf *conf)
{
        int idx;

        for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
                _allow_barrier(conf, idx);
}

/* conf->resync_lock should be held */
static int get_unqueued_pending(struct r1conf *conf)
{
        int idx, ret;

        for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
                ret += atomic_read(&conf->nr_pending[idx]) -
                       atomic_read(&conf->nr_queued[idx]);

        return ret;
}
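
/*
 * Note on the accounting above: nr_pending[idx] counts I/O that has passed
 * the barrier of bucket idx but has not yet completed, while nr_queued[idx]
 * counts I/O parked on the retry/bio_end_io lists for that bucket. Their
 * difference is the I/O that is genuinely in flight, which is what
 * freeze_array() below waits to drain.
 */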
static void freeze_array(struct r1conf *conf, int extra)
{
        /* stop syncio and normal IO and wait for everything to
        /* Stop sync I/O and normal I/O and wait for everything to
         * go quiet.
         * We wait until nr_pending matches nr_queued + extra.
         * This is called in the context of one normal IO request
         * that has failed. Thus any sync request that might be pending
         * will be blocked by nr_pending, and we need to wait for
         * pending IO requests to complete or be queued for re-try.
         * Thus the number queued (nr_queued) plus this request (extra)
         * must match the number of pending IOs (nr_pending) before
         * we continue.
         * This is called in two situations:
         *  1) management command handlers (reshape, remove disk, quiesce).
         *  2) one normal I/O request failed.
         * After array_frozen is set to 1, new sync IO will be blocked at
         * raise_barrier(), and new normal I/O will be blocked at
         * _wait_barrier() or wait_read_barrier(). The flying I/Os will either
         * complete or be queued. When everything goes quiet, there are only
         * queued I/Os left.
         * Every flying I/O contributes to a conf->nr_pending[idx]; idx is the
         * barrier bucket index which this I/O request hits. When all sync and
         * normal I/O are queued, the sum of all conf->nr_pending[] will match
         * the sum of all conf->nr_queued[]. But normal I/O failure is an
         * exception: in handle_read_error(), we may call freeze_array() before
         * trying to fix the read error. In this case, the erroring read I/O is
         * not queued, so get_unqueued_pending() == 1.
         *
         * Therefore before this function returns, we need to wait until
         * get_unqueued_pending(conf) gets equal to extra. For
         * normal I/O context, extra is 1; in all other situations extra is 0.
         */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 1;
        raid1_log(conf->mddev, "wait freeze");
        wait_event_lock_irq_cmd(conf->wait_barrier,
                                conf->nr_pending == conf->nr_queued + extra,
        wait_event_lock_irq_cmd(
                conf->wait_barrier,
                get_unqueued_pending(conf) == extra,
                conf->resync_lock,
                flush_pending_writes(conf));
        spin_unlock_irq(&conf->resync_lock);
@@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf)
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 0;
        wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
        wake_up(&conf->wait_barrier);
}
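
/*
 * A sketch of how freeze_array()/unfreeze_array() bracket an operation, as
 * described in the freeze_array() comment above. Illustrative only: the
 * hypothetical example_error_path() stands in for real callers such as
 * handle_read_error(), which are outside this hunk.
 */
static void example_error_path(struct r1conf *conf)
{
        /* the failed I/O is still pending but not yet queued, so extra == 1 */
        freeze_array(conf, 1);
        /* ... repair runs here while no new sync or regular I/O can start ... */
        unfreeze_array(conf);
}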
/* duplicate the data pages for behind I/O
@@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
        kfree(plug);
}

static void raid1_read_request(struct mddev *mddev, struct bio *bio,
                               struct r1bio *r1_bio)
static inline struct r1bio *
alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
{
        struct r1conf *conf = mddev->private;
        struct r1bio *r1_bio;

        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
        r1_bio->master_bio = bio;
        r1_bio->sectors = bio_sectors(bio) - sectors_handled;
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
        return r1_bio;
}
static void raid1_read_request(struct mddev *mddev, struct bio *bio)
{
        struct r1conf *conf = mddev->private;
        struct raid1_info *mirror;
        struct r1bio *r1_bio;
        struct bio *read_bio;
        struct bitmap *bitmap = mddev->bitmap;
        const int op = bio_op(bio);
@@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
        int max_sectors;
        int rdisk;

        wait_barrier(conf, bio);
        /*
         * Still need barrier for READ in case that whole
         * array is frozen.
         */
        wait_read_barrier(conf, bio->bi_iter.bi_sector);

        r1_bio = alloc_r1bio(mddev, bio, 0);

        /*
         * We might need to issue multiple reads to different
         * devices if there are bad blocks around, so we keep
         * track of the number of reads in bio->bi_phys_segments.
         * If this is 0, there is only one r1_bio and no locking
         * will be needed when requests complete. If it is
         * non-zero, then it is the number of not-completed requests.
         */
        bio->bi_phys_segments = 0;
        bio_clear_flag(bio, BIO_SEG_VALID);

        /*
         * make_request() can abort the operation when read-ahead is being
         * used and no empty request is available.
         */
read_again:
        rdisk = read_balance(conf, r1_bio, &max_sectors);
@@ -1106,9 +1223,8 @@ read_again:
                           atomic_read(&bitmap->behind_writes) == 0);
        }
        r1_bio->read_disk = rdisk;
        r1_bio->start_next_window = 0;

        read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
        read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
        bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
                 max_sectors);
@@ -1151,22 +1267,16 @@ read_again:
                 */
                reschedule_retry(r1_bio);

                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
                r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
                r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
                goto read_again;
        } else
                generic_make_request(read_bio);
}
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                struct r1bio *r1_bio)
static void raid1_write_request(struct mddev *mddev, struct bio *bio)
{
        struct r1conf *conf = mddev->private;
        struct r1bio *r1_bio;
        int i, disks;
        struct bitmap *bitmap = mddev->bitmap;
        unsigned long flags;
@@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        int first_clone;
        int sectors_handled;
        int max_sectors;
        sector_t start_next_window;

        /*
         * Register the new request and wait if the reconstruction
@@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                }
                finish_wait(&conf->wait_barrier, &w);
        }
        start_next_window = wait_barrier(conf, bio);
        wait_barrier(conf, bio->bi_iter.bi_sector);

        r1_bio = alloc_r1bio(mddev, bio, 0);

        /* We might need to issue multiple writes to different
         * devices if there are bad blocks around, so we keep
         * track of the number of writes in bio->bi_phys_segments.
         * If this is 0, there is only one r1_bio and no locking
         * will be needed when requests complete. If it is
         * non-zero, then it is the number of not-completed requests.
         */
        bio->bi_phys_segments = 0;
        bio_clear_flag(bio, BIO_SEG_VALID);

        if (conf->pending_count >= max_queued_requests) {
                md_wakeup_thread(mddev->thread);
@@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        disks = conf->raid_disks * 2;
 retry_write:
        r1_bio->start_next_window = start_next_window;
        blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r1_bio->sectors;
@@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        if (unlikely(blocked_rdev)) {
                /* Wait for this device to become unblocked */
                int j;
                sector_t old = start_next_window;

                for (j = 0; j < i; j++)
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
                allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
                allow_barrier(conf, bio->bi_iter.bi_sector);
                raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                start_next_window = wait_barrier(conf, bio);
                /*
                 * We must make sure the multi r1bios of bio have
                 * the same value of bi_phys_segments
                 */
                if (bio->bi_phys_segments && old &&
                    old != start_next_window)
                        /* Wait for the former r1bio(s) to complete */
                        wait_event(conf->wait_barrier,
                                   bio->bi_phys_segments == 1);
                wait_barrier(conf, bio->bi_iter.bi_sector);
                goto retry_write;
        }
@@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
        first_clone = 1;

        for (i = 0; i < disks; i++) {
                struct bio *mbio;
                struct bio *mbio = NULL;
                sector_t offset;

                if (!r1_bio->bios[i])
                        continue;

                mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
                bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
                offset = r1_bio->sector - bio->bi_iter.bi_sector;

                if (first_clone) {
                        /* do behind I/O ?
@@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        if (bitmap &&
                            (atomic_read(&bitmap->behind_writes)
                             < mddev->bitmap_info.max_write_behind) &&
                            !waitqueue_active(&bitmap->behind_wait))
                            !waitqueue_active(&bitmap->behind_wait)) {
                                mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
                                                                mddev->bio_set,
                                                                offset << 9,
                                                                max_sectors << 9);
                                alloc_behind_pages(mbio, r1_bio);
                        }

                        bitmap_startwrite(bitmap, r1_bio->sector,
                                          r1_bio->sectors,
@@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                                   &r1_bio->state));
                        first_clone = 0;
                }

                if (!mbio) {
                        if (r1_bio->behind_bvecs)
                                mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
                                                                mddev->bio_set,
                                                                offset << 9,
                                                                max_sectors << 9);
                        else {
                                mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
                                bio_trim(mbio, offset, max_sectors);
                        }
                }

                if (r1_bio->behind_bvecs) {
                        struct bio_vec *bvec;
                        int j;
@@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                                   conf->mirrors[i].rdev->data_offset);
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
                mbio->bi_opf = bio_op(bio) |
                        (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
                mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
                if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
                    !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
                    conf->raid_disks - mddev->degraded > 1)
@@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                /* We need another r1_bio. It has already been counted
                 * in bio->bi_phys_segments
                 */
                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
                r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
                r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
                goto retry_write;
        }
@@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
static void raid1_make_request(struct mddev *mddev, struct bio *bio)
{
        struct r1conf *conf = mddev->private;
        struct r1bio *r1_bio;
        /*
         * make_request() can abort the operation when read-ahead is being
         * used and no empty request is available.
         *
         */
        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
        struct bio *split;
        sector_t sectors;

        r1_bio->master_bio = bio;
        r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_iter.bi_sector;

        if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
                md_flush_request(mddev, bio);
                return;
        }

        /*
         * We might need to issue multiple reads to different devices if there
         * are bad blocks around, so we keep track of the number of reads in
         * bio->bi_phys_segments. If this is 0, there is only one r1_bio and
         * no locking will be needed when requests complete. If it is
         * non-zero, then it is the number of not-completed requests.
         */
        bio->bi_phys_segments = 0;
        bio_clear_flag(bio, BIO_SEG_VALID);

        /* if bio exceeds barrier unit boundary, split it */
        do {
                sectors = align_to_barrier_unit_end(
                                bio->bi_iter.bi_sector, bio_sectors(bio));
                if (sectors < bio_sectors(bio)) {
                        split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
                        bio_chain(split, bio);
                } else {
                        split = bio;
                }

        if (bio_data_dir(bio) == READ)
                raid1_read_request(mddev, bio, r1_bio);
                if (bio_data_dir(split) == READ)
                        raid1_read_request(mddev, split);
                else
                        raid1_write_request(mddev, bio, r1_bio);
                        raid1_write_request(mddev, split);
        } while (split != bio);
}
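
/*
 * Worked example for the split loop above, assuming the 64 MB barrier unit
 * (BARRIER_UNIT_SECTOR_SIZE == 1 << 17) used elsewhere in this patch: a
 * 1024-sector bio starting at sector 130560 crosses a barrier unit boundary
 * at sector 131072.
 *
 *   pass 1: align_to_barrier_unit_end(130560, 1024) == 512, so a 512-sector
 *           clone is split off, chained, and handled entirely within the
 *           first barrier bucket;
 *   pass 2: the remaining 512 sectors start at 131072, lie inside the next
 *           barrier unit, and split == bio ends the loop.
 *
 * Each portion therefore takes wait_barrier()/wait_read_barrier() on exactly
 * one bucket index.
 */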
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
@@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf)
static void close_sync(struct r1conf *conf)
{
        wait_barrier(conf, NULL);
        allow_barrier(conf, 0, 0);
        wait_all_barriers(conf);
        allow_all_barriers(conf);
        mempool_destroy(conf->r1buf_pool);
        conf->r1buf_pool = NULL;

        spin_lock_irq(&conf->resync_lock);
        conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE;
        conf->start_next_window = MaxSector;
        conf->current_window_requests +=
                conf->next_window_requests;
        conf->next_window_requests = 0;
        spin_unlock_irq(&conf->resync_lock);
}
static int raid1_spare_active(struct mddev *mddev)
@@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
                wbio->bi_vcnt = vcnt;
        } else {
                wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
                wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
                                      mddev->bio_set);
        }

        bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
@@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
        int m;
        int m, idx;
        bool fail = false;

        for (m = 0; m < conf->raid_disks * 2; m++)
                if (r1_bio->bios[m] == IO_MADE_GOOD) {
                        struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
        if (fail) {
                spin_lock_irq(&conf->device_lock);
                list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
                conf->nr_queued++;
                idx = sector_to_idx(r1_bio->sector);
                atomic_inc(&conf->nr_queued[idx]);
                spin_unlock_irq(&conf->device_lock);
                /*
                 * In case freeze_array() is waiting for the condition
                 * get_unqueued_pending() == extra to be true.
                 */
                wake_up(&conf->wait_barrier);
                md_wakeup_thread(conf->mddev->thread);
        } else {
                if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2411,7 +2526,8 @@ read_more:
                const unsigned long do_sync
                        = r1_bio->master_bio->bi_opf & REQ_SYNC;
                r1_bio->read_disk = disk;
                bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
                bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
                                     mddev->bio_set);
                bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
                         max_sectors);
                r1_bio->bios[r1_bio->read_disk] = bio;
@@ -2445,15 +2561,8 @@ read_more:
                        generic_make_request(bio);
                        bio = NULL;

                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                        r1_bio->master_bio = mbio;
                        r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
                        r1_bio->sector = mbio->bi_iter.bi_sector +
                                sectors_handled;

                        goto read_more;
                } else {
@@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread)
        struct r1conf *conf = mddev->private;
        struct list_head *head = &conf->retry_list;
        struct blk_plug plug;
        int idx;

        md_check_recovery(mddev);
@@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread)
            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
                LIST_HEAD(tmp);
                spin_lock_irqsave(&conf->device_lock, flags);
                if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
                        while (!list_empty(&conf->bio_end_io_list)) {
                                list_move(conf->bio_end_io_list.prev, &tmp);
                                conf->nr_queued--;
                        }
                }
                if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
                        list_splice_init(&conf->bio_end_io_list, &tmp);
                spin_unlock_irqrestore(&conf->device_lock, flags);
                while (!list_empty(&tmp)) {
                        r1_bio = list_first_entry(&tmp, struct r1bio,
                                                  retry_list);
                        list_del(&r1_bio->retry_list);
                        idx = sector_to_idx(r1_bio->sector);
                        atomic_dec(&conf->nr_queued[idx]);
                        if (mddev->degraded)
                                set_bit(R1BIO_Degraded, &r1_bio->state);
                        if (test_bit(R1BIO_WriteError, &r1_bio->state))
@@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread)
                }
                r1_bio = list_entry(head->prev, struct r1bio, retry_list);
                list_del(head->prev);
                conf->nr_queued--;
                idx = sector_to_idx(r1_bio->sector);
                atomic_dec(&conf->nr_queued[idx]);
                spin_unlock_irqrestore(&conf->device_lock, flags);
                mddev = r1_bio->mddev;
@@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf)
                                          conf->poolinfo);
        if (!conf->r1buf_pool)
                return -ENOMEM;
        conf->next_resync = 0;
        return 0;
}
@@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
        int still_degraded = 0;
        int good_sectors = RESYNC_SECTORS;
        int min_bad = 0; /* number of sectors that are bad in all devices */
        int idx = sector_to_idx(sector_nr);

        if (!conf->r1buf_pool)
                if (init_resync(conf))
@@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
         * If there is non-resync activity waiting for a turn, then let it
         * through before starting on this new sync request.
         */
        if (conf->nr_waiting)
        if (atomic_read(&conf->nr_waiting[idx]))
                schedule_timeout_uninterruptible(1);

        /* we are incrementing sector_nr below. To be safe, we check against
@@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
        r1_bio->sector = sector_nr;
        r1_bio->state = 0;
        set_bit(R1BIO_IsSync, &r1_bio->state);
        /* make sure good_sectors won't go across barrier unit boundary */
        good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);

        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
@@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        if (!conf)
                goto abort;

        conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
                                   sizeof(atomic_t), GFP_KERNEL);
        if (!conf->nr_pending)
                goto abort;

        conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
                                   sizeof(atomic_t), GFP_KERNEL);
        if (!conf->nr_waiting)
                goto abort;

        conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
                                  sizeof(atomic_t), GFP_KERNEL);
        if (!conf->nr_queued)
                goto abort;

        conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
                                sizeof(atomic_t), GFP_KERNEL);
        if (!conf->barrier)
                goto abort;

        conf->mirrors = kzalloc(sizeof(struct raid1_info)
                                * mddev->raid_disks * 2,
                                GFP_KERNEL);
@@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->pending_count = 0;
        conf->recovery_disabled = mddev->recovery_disabled - 1;

        conf->start_next_window = MaxSector;
        conf->current_window_requests = conf->next_window_requests = 0;

        err = -EIO;
        for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                kfree(conf->mirrors);
                safe_put_page(conf->tmppage);
                kfree(conf->poolinfo);
                kfree(conf->nr_pending);
                kfree(conf->nr_waiting);
                kfree(conf->nr_queued);
                kfree(conf->barrier);
                kfree(conf);
        }
        return ERR_PTR(err);
@@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv)
        kfree(conf->mirrors);
        safe_put_page(conf->tmppage);
        kfree(conf->poolinfo);
        kfree(conf->nr_pending);
        kfree(conf->nr_waiting);
        kfree(conf->nr_queued);
        kfree(conf->barrier);
        kfree(conf);
}