@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	int ret = 0;
	void *entry;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	if (!trunc &&
	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	WARN_ON_ONCE(!ret);
	return ret;
}
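The locking requirement in the comment above is worth making concrete. A minimal sketch of the expected truncate/punch-hole call site, assuming a filesystem-private rwsem and a hypothetical FS_I() inode accessor (neither is defined by this patch):

/* Illustrative sketch only, not part of this patch. */
static void fs_punch_hole(struct inode *inode, loff_t start, loff_t len)
{
	/* Exclude concurrent DAX page faults on this inode. */
	down_write(&FS_I(inode)->i_mmap_sem);
	/*
	 * truncate_pagecache_range() walks the radix tree and is expected to
	 * reach dax_delete_mapping_entry() for each exceptional DAX entry,
	 * which is why the entry must still be found at that index.
	 */
	truncate_pagecache_range(inode, start, start + len - 1);
	up_write(&FS_I(inode)->i_mmap_sem);
}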
/*
 * Invalidate exceptional DAX entry if easily possible. This handles DAX
 * entries for invalidate_inode_pages() so we evict the entry only if we can
 * do so without blocking.
 */
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = 0;
	void *entry, **slot;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry) ||
	    slot_locked(mapping, slot))
		goto out;
	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
	if (ret)
		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
	return ret;
}
	return 1;

/*
 * Invalidate exceptional DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_mapping_entry(mapping, index, false);
}
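Taken together, the three exported helpers split the work by caller. The dispatcher below is purely hypothetical and exists only to illustrate that split; the real callers live in the mm truncate/invalidate code:

/* Hypothetical wrapper, for illustration only. */
static int illustrate_dax_entry_removal(struct address_space *mapping,
					pgoff_t index, bool truncating,
					bool may_wait)
{
	if (truncating)
		/* truncate/punch hole: the entry must go, dirty or not */
		return dax_delete_mapping_entry(mapping, index);
	if (may_wait)
		/* may wait for the entry lock, but evicts clean entries only */
		return dax_invalidate_mapping_entry_sync(mapping, index);
	/* invalidate_inode_pages(): give up on locked or dirty entries */
	return dax_invalidate_mapping_entry(mapping, index);
}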
/*
@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
static int dax_load_hole(struct address_space *mapping, void **entry,
			 struct vm_fault *vmf)
{
	struct page *page;
	int ret;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	if (!radix_tree_exceptional_entry(*entry)) {
		page = *entry;
		goto out;
	}

	/* This will replace locked radix tree entry with a hole page */
@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
			vmf->gfp_mask | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
out:
	vmf->page = page;
	return VM_FAULT_LOCKED;
	ret = finish_fault(vmf);
	vmf->page = NULL;
	*entry = page;
	if (!ret) {
		/* Grab reference for PTE that is now referencing the page */
		get_page(page);
		return VM_FAULT_NOPAGE;
	}
	return ret;
}
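Note the changed contract: instead of handing a locked page back to the generic fault code via VM_FAULT_LOCKED, dax_load_hole() now calls finish_fault() itself while the radix tree entry is still locked, so its caller only has to drop that entry lock. A sketch of the resulting return values, as this patch uses them:

/*
 * dax_load_hole() return values after this change (sketch):
 *
 *   VM_FAULT_NOPAGE  - finish_fault() installed the PTE for the zero page;
 *                      an extra page reference was taken for that PTE
 *   VM_FAULT_OOM     - no hole page could be allocated
 *   other            - whatever finish_fault() reported; this call did not
 *                      install a PTE and took no extra reference
 */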
static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
		invalidate_inode_pages2_range(inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}
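	/*
	 * Illustrative sequence of the problem the invalidation above solves
	 * (sketch): task A has the hole mmapped, task B does the write.
	 *
	 *   A: read fault on a hole  -> zero page in page cache, PTE set up
	 *   B: write(2)              -> iomap_begin() allocates, IOMAP_F_NEW
	 *   B: invalidate_inode_pages2_range() -> A's zero page is unmapped
	 *   B: copies the data into the newly allocated block
	 *   A: next fault maps the real block and sees B's data
	 *
	 * Without the call above, A could keep reading zeros from the stale
	 * hole page even though write(2) has succeeded.
	 */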
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
	if (iov_iter_rw(iter) == WRITE)
		flags |= IOMAP_WRITE;

	/*
	 * Yes, even DAX files can have page cache attached to them: A zeroed
	 * page is inserted into the pagecache when we have to serve a write
	 * fault on a hole. It should never be dirtied and can simply be
	 * dropped from the pagecache once we get real data for the page.
	 *
	 * XXX: This is racy against mmap, and there's nothing we can do about
	 * it. We'll eventually need to shift this down even further so that
	 * we can check if we allocated blocks over a hole first.
	 */
	if (mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				pos >> PAGE_SHIFT,
				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}
/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		goto unlock_entry;
		return dax_fault_return(error);
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		error = -EIO;	/* fs corruption? */
		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
		goto finish_iomap;
	}

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		vmf_ret = dax_fault_return(PTR_ERR(entry));
		goto finish_iomap;
	}
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
		}

		if (error)
			goto finish_iomap;
			goto error_unlock_entry;

		__SetPageUptodate(vmf->cow_page);
		vmf_ret = finish_fault(vmf);
		if (!vmf_ret)
			vmf_ret = VM_FAULT_DONE_COW;
		goto finish_iomap;
		goto unlock_entry;
	}

	switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vma, vmf);
		/* -EBUSY is fine, somebody else faulted on the same PTE */
		if (error == -EBUSY)
			error = 0;
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			vmf_ret = dax_load_hole(mapping, entry, vmf);
			break;
			vmf_ret = dax_load_hole(mapping, &entry, vmf);
			goto unlock_entry;
		}
		/*FALLTHRU*/
	default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
		break;
	}

error_unlock_entry:
	vmf_ret = dax_fault_return(error) | major;
unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
	if (ops->iomap_end) {
		if (error || (vmf_ret & VM_FAULT_ERROR)) {
			/* keep previous error */
			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PAGE_SIZE,
					PAGE_SIZE, flags, &iomap);
		}
	}
unlock_entry:
	if (vmf_ret != VM_FAULT_LOCKED || error)
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if (error < 0 && error != -EBUSY)
		return VM_FAULT_SIGBUS | major;
	if (vmf_ret) {
		WARN_ON_ONCE(error);	/* -EBUSY from ops->iomap_end? */
		return vmf_ret;
		int copied = PAGE_SIZE;

		if (vmf_ret & VM_FAULT_ERROR)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PTE we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
	}
	return VM_FAULT_NOPAGE | major;
	return vmf_ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
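The net effect of the fault-path changes above is a reordering: the iomap operation now brackets the whole fault, while the radix tree entry lock is held only around the part that actually manipulates the mapping. A sketch of the resulting order (error paths elided):

/*
 * dax_iomap_fault(), after this patch (sketch):
 *
 *	ops->iomap_begin()		- no radix tree entry lock held yet
 *	entry = grab_mapping_entry()	- lock the entry
 *	dax_insert_mapping() /
 *	dax_load_hole() / CoW handling	- PTE is installed here
 *	put_locked_mapping_entry()	- unlock the entry
 *	ops->iomap_end()		- result is already final, so any
 *					  error from it is ignored
 */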
@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry. If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;
		goto fallback;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry. If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto finish_iomap;

	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto finish_iomap;
			goto unlock_entry;
		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
				&entry);
		break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		break;
	}

unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
	if (ops->iomap_end) {
		if (result == VM_FAULT_FALLBACK) {
			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
					iomap_flags, &iomap);
			if (error)
				result = VM_FAULT_FALLBACK;
		}
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PMD we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, pmd, address);