Elphel / linux-elphel — commit f1301897
Authored Jun 09, 2023 by Andrey Filippov
added more files for debugging
Parent: 296d54dd

Showing 6 changed files with 5913 additions and 26 deletions (+5913 / −26)
    src/block/bio.c            +2098  −0
    src/drivers/block/loop.c   +2168  −0
    src/fs/block_dev.c         +16    −12
    src/fs/read_write.c        +13    −4
    src/lib/iov_iter.c         +1600  −0
    src/mm/filemap.c           +18    −10
src/block/bio.c — new file (0 → 100644)
/*
* Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public Licens
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*
*/
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
* itself, to shrink a bio data allocation from two mempool calls to one
*/
#define BIO_INLINE_VECS 4
/*
* if you change this list, also change bvec_alloc or things will
* break badly! cannot be bigger than what you can fit into an
* unsigned short
*/
#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
	BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
};
#undef BV
/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);
/*
* Our slab pool management
*/
struct bio_slab {
	struct kmem_cache *slab;
	unsigned int slab_ref;
	unsigned int slab_size;
	char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
static struct bio_slab *bio_slabs;
static unsigned int bio_slab_nr, bio_slab_max;
static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
	unsigned int sz = sizeof(struct bio) + extra_size;
	struct kmem_cache *slab = NULL;
	struct bio_slab *bslab, *new_bio_slabs;
	unsigned int new_bio_slab_max;
	unsigned int i, entry = -1;

	mutex_lock(&bio_slab_lock);

	i = 0;
	while (i < bio_slab_nr) {
		bslab = &bio_slabs[i];

		if (!bslab->slab && entry == -1)
			entry = i;
		else if (bslab->slab_size == sz) {
			slab = bslab->slab;
			bslab->slab_ref++;
			break;
		}
		i++;
	}

	if (slab)
		goto out_unlock;

	if (bio_slab_nr == bio_slab_max && entry == -1) {
		new_bio_slab_max = bio_slab_max << 1;
		new_bio_slabs = krealloc(bio_slabs,
					 new_bio_slab_max * sizeof(struct bio_slab),
					 GFP_KERNEL);
		if (!new_bio_slabs)
			goto out_unlock;
		bio_slab_max = new_bio_slab_max;
		bio_slabs = new_bio_slabs;
	}
	if (entry == -1)
		entry = bio_slab_nr++;

	bslab = &bio_slabs[entry];

	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
				 SLAB_HWCACHE_ALIGN, NULL);
	if (!slab)
		goto out_unlock;

	bslab->slab = slab;
	bslab->slab_ref = 1;
	bslab->slab_size = sz;
out_unlock:
	mutex_unlock(&bio_slab_lock);
	return slab;
}
static void bio_put_slab(struct bio_set *bs)
{
	struct bio_slab *bslab = NULL;
	unsigned int i;

	mutex_lock(&bio_slab_lock);

	for (i = 0; i < bio_slab_nr; i++) {
		if (bs->bio_slab == bio_slabs[i].slab) {
			bslab = &bio_slabs[i];
			break;
		}
	}

	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
		goto out;

	WARN_ON(!bslab->slab_ref);

	if (--bslab->slab_ref)
		goto out;

	kmem_cache_destroy(bslab->slab);
	bslab->slab = NULL;

out:
	mutex_unlock(&bio_slab_lock);
}
unsigned int bvec_nr_vecs(unsigned short idx)
{
	return bvec_slabs[--idx].nr_vecs;
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
{
	if (!idx)
		return;
	idx--;

	BIO_BUG_ON(idx >= BVEC_POOL_NR);

	if (idx == BVEC_POOL_MAX) {
		mempool_free(bv, pool);
	} else {
		struct biovec_slab *bvs = bvec_slabs + idx;

		kmem_cache_free(bvs->slab, bv);
	}
}
struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
			   mempool_t *pool)
{
	struct bio_vec *bvl;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
	case 1:
		*idx = 0;
		break;
	case 2 ... 4:
		*idx = 1;
		break;
	case 5 ... 16:
		*idx = 2;
		break;
	case 17 ... 64:
		*idx = 3;
		break;
	case 65 ... 128:
		*idx = 4;
		break;
	case 129 ... BIO_MAX_PAGES:
		*idx = 5;
		break;
	default:
		return NULL;
	}

	/*
	 * idx now points to the pool we want to allocate from. only the
	 * 1-vec entry pool is mempool backed.
	 */
	if (*idx == BVEC_POOL_MAX) {
fallback:
		bvl = mempool_alloc(pool, gfp_mask);
	} else {
		struct biovec_slab *bvs = bvec_slabs + *idx;
		gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);

		/*
		 * Make this allocation restricted and don't dump info on
		 * allocation failures, since we'll fallback to the mempool
		 * in case of failure.
		 */
		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

		/*
		 * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
		 * is set, retry with the 1-entry mempool
		 */
		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
		if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
			*idx = BVEC_POOL_MAX;
			goto fallback;
		}
	}

	(*idx)++;
	return bvl;
}
void bio_uninit(struct bio *bio)
{
	bio_disassociate_task(bio);
}
EXPORT_SYMBOL(bio_uninit);

static void bio_free(struct bio *bio)
{
	struct bio_set *bs = bio->bi_pool;
	void *p;

	bio_uninit(bio);

	if (bs) {
		bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));

		/*
		 * If we have front padding, adjust the bio pointer before freeing
		 */
		p = bio;
		p -= bs->front_pad;

		mempool_free(p, &bs->bio_pool);
	} else {
		/* Bio was allocated by bio_kmalloc() */
		kfree(bio);
	}
}
/*
* Users of this function have their own bio allocation. Subsequently,
* they must remember to pair any call to bio_init() with bio_uninit()
* when IO has completed, or when the bio is released.
*/
void bio_init(struct bio *bio, struct bio_vec *table,
	      unsigned short max_vecs)
{
	memset(bio, 0, sizeof(*bio));
	atomic_set(&bio->__bi_remaining, 1);
	atomic_set(&bio->__bi_cnt, 1);

	bio->bi_io_vec = table;
	bio->bi_max_vecs = max_vecs;
}
EXPORT_SYMBOL(bio_init);
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
*
* Description:
* After calling bio_reset(), @bio will be in the same state as a freshly
* allocated bio returned bio bio_alloc_bioset() - the only fields that are
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
void bio_reset(struct bio *bio)
{
	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

	bio_uninit(bio);

	memset(bio, 0, BIO_RESET_BYTES);
	bio->bi_flags = flags;
	atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);
static struct bio *__bio_chain_endio(struct bio *bio)
{
	struct bio *parent = bio->bi_private;

	if (!parent->bi_status)
		parent->bi_status = bio->bi_status;
	bio_put(bio);
	return parent;
}

static void bio_chain_endio(struct bio *bio)
{
	bio_endio(__bio_chain_endio(bio));
}
/**
* bio_chain - chain bio completions
* @bio: the target bio
* @parent: the @bio's parent bio
*
* The caller won't have a bi_end_io called when @bio completes - instead,
* @parent's bi_end_io won't be called until both @parent and @bio have
* completed; the chained bio will also be freed when it completes.
*
* The caller must not set bi_private or bi_end_io in @bio.
*/
void bio_chain(struct bio *bio, struct bio *parent)
{
	BUG_ON(bio->bi_private || bio->bi_end_io);

	bio->bi_private = parent;
	bio->bi_end_io	= bio_chain_endio;
	bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);
static void bio_alloc_rescue(struct work_struct *work)
{
	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
	struct bio *bio;

	while (1) {
		spin_lock(&bs->rescue_lock);
		bio = bio_list_pop(&bs->rescue_list);
		spin_unlock(&bs->rescue_lock);

		if (!bio)
			break;

		generic_make_request(bio);
	}
}
static void punt_bios_to_rescuer(struct bio_set *bs)
{
	struct bio_list punt, nopunt;
	struct bio *bio;

	if (WARN_ON_ONCE(!bs->rescue_workqueue))
		return;
	/*
	 * In order to guarantee forward progress we must punt only bios that
	 * were allocated from this bio_set; otherwise, if there was a bio on
	 * there for a stacking driver higher up in the stack, processing it
	 * could require allocating bios from this bio_set, and doing that from
	 * our own rescuer would be bad.
	 *
	 * Since bio lists are singly linked, pop them all instead of trying to
	 * remove from the middle of the list:
	 */

	bio_list_init(&punt);
	bio_list_init(&nopunt);

	while ((bio = bio_list_pop(&current->bio_list[0])))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
	current->bio_list[0] = nopunt;

	bio_list_init(&nopunt);
	while ((bio = bio_list_pop(&current->bio_list[1])))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
	current->bio_list[1] = nopunt;

	spin_lock(&bs->rescue_lock);
	bio_list_merge(&bs->rescue_list, &punt);
	spin_unlock(&bs->rescue_lock);

	queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Description:
* If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
* backed by the @bs's mempool.
*
* When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
* always be able to allocate a bio. This is due to the mempool guarantees.
* To make this work, callers must never allocate more than 1 bio at a time
* from this pool. Callers that need to allocate more than 1 bio must always
* submit the previously allocated bio for IO before attempting to allocate
* a new one. Failure to do so can cause deadlocks under memory pressure.
*
* Note that when running under generic_make_request() (i.e. any block
* driver), bios are not submitted until after you return - see the code in
* generic_make_request() that converts recursion into iteration, to prevent
* stack overflows.
*
* This would normally mean allocating multiple bios under
* generic_make_request() would be susceptible to deadlocks, but we have
* deadlock avoidance code that resubmits any blocked bios from a rescuer
* thread.
*
* However, we do not guarantee forward progress for allocations from other
* mempools. Doing multiple allocations from the same mempool under
* generic_make_request() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
			     struct bio_set *bs)
{
	gfp_t saved_gfp = gfp_mask;
	unsigned front_pad;
	unsigned inline_vecs;
	struct bio_vec *bvl = NULL;
	struct bio *bio;
	void *p;

	if (!bs) {
		if (nr_iovecs > UIO_MAXIOV)
			return NULL;

		p = kmalloc(sizeof(struct bio) +
			    nr_iovecs * sizeof(struct bio_vec),
			    gfp_mask);
		front_pad = 0;
		inline_vecs = nr_iovecs;
	} else {
		/* should not use nobvec bioset for nr_iovecs > 0 */
		if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
				 nr_iovecs > 0))
			return NULL;
		/*
		 * generic_make_request() converts recursion to iteration; this
		 * means if we're running beneath it, any bios we allocate and
		 * submit will not be submitted (and thus freed) until after we
		 * return.
		 *
		 * This exposes us to a potential deadlock if we allocate
		 * multiple bios from the same bio_set() while running
		 * underneath generic_make_request(). If we were to allocate
		 * multiple bios (say a stacking block driver that was splitting
		 * bios), we would deadlock if we exhausted the mempool's
		 * reserve.
		 *
		 * We solve this, and guarantee forward progress, with a rescuer
		 * workqueue per bio_set. If we go to allocate and there are
		 * bios on current->bio_list, we first try the allocation
		 * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
		 * bios we would be blocking to the rescuer workqueue before
		 * we retry with the original gfp_flags.
		 */

		if (current->bio_list &&
		    (!bio_list_empty(&current->bio_list[0]) ||
		     !bio_list_empty(&current->bio_list[1])) &&
		    bs->rescue_workqueue)
			gfp_mask &= ~__GFP_DIRECT_RECLAIM;

		p = mempool_alloc(&bs->bio_pool, gfp_mask);
		if (!p && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			p = mempool_alloc(&bs->bio_pool, gfp_mask);
		}

		front_pad = bs->front_pad;
		inline_vecs = BIO_INLINE_VECS;
	}

	if (unlikely(!p))
		return NULL;

	bio = p + front_pad;
	bio_init(bio, NULL, 0);

	if (nr_iovecs > inline_vecs) {
		unsigned long idx = 0;

		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
		if (!bvl && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
		}

		if (unlikely(!bvl))
			goto err_free;

		bio->bi_flags |= idx << BVEC_POOL_OFFSET;
	} else if (nr_iovecs) {
		bvl = bio->bi_inline_vecs;
	}

	bio->bi_pool = bs;
	bio->bi_max_vecs = nr_iovecs;
	bio->bi_io_vec = bvl;
	return bio;

err_free:
	mempool_free(p, &bs->bio_pool);
	return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
	unsigned long flags;
	struct bio_vec bv;
	struct bvec_iter iter;

	__bio_for_each_segment(bv, bio, iter, start) {
		char *data = bvec_kmap_irq(&bv, &flags);
		memset(data, 0, bv.bv_len);
		flush_dcache_page(bv.bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio_iter);
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
*
* Description:
* Put a reference to a &struct bio, either one you have gotten with
* bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
**/
void bio_put(struct bio *bio)
{
	if (!bio_flagged(bio, BIO_REFFED))
		bio_free(bio);
	else {
		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));

		/*
		 * last put frees it
		 */
		if (atomic_dec_and_test(&bio->__bi_cnt))
			bio_free(bio);
	}
}
EXPORT_SYMBOL(bio_put);
inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}
EXPORT_SYMBOL(bio_phys_segments);
/**
* __bio_clone_fast - clone a bio that shares the original bio's biovec
* @bio: destination bio
* @bio_src: bio to clone
*
* Clone a &bio. Caller will own the returned bio, but not
* the actual data it points to. Reference count of returned
* bio will be one.
*
* Caller must ensure that @bio_src is not freed before @bio.
*/
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
	BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));

	/*
	 * most users will be overriding ->bi_disk with a new target,
	 * so we don't set nor calculate new physical/hw segment counts here
	 */
	bio->bi_disk = bio_src->bi_disk;
	bio->bi_partno = bio_src->bi_partno;
	bio_set_flag(bio, BIO_CLONED);
	if (bio_flagged(bio_src, BIO_THROTTLED))
		bio_set_flag(bio, BIO_THROTTLED);
	bio->bi_opf = bio_src->bi_opf;
	bio->bi_write_hint = bio_src->bi_write_hint;
	bio->bi_iter = bio_src->bi_iter;
	bio->bi_io_vec = bio_src->bi_io_vec;

	bio_clone_blkcg_association(bio, bio_src);
}
EXPORT_SYMBOL(__bio_clone_fast);
/**
* bio_clone_fast - clone a bio that shares the original bio's biovec
* @bio: bio to clone
* @gfp_mask: allocation priority
* @bs: bio_set to allocate from
*
* Like __bio_clone_fast, only also allocates the returned bio
*/
struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
{
	struct bio *b;

	b = bio_alloc_bioset(gfp_mask, 0, bs);
	if (!b)
		return NULL;

	__bio_clone_fast(b, bio);

	if (bio_integrity(bio)) {
		int ret;

		ret = bio_integrity_clone(b, bio, gfp_mask);

		if (ret < 0) {
			bio_put(b);
			return NULL;
		}
	}

	return b;
}
EXPORT_SYMBOL(bio_clone_fast);
/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
* @bio: destination bio
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
* number of reasons, such as the bio being full or target block device
* limitations. The target block device must allow bio's up to PAGE_SIZE,
* so it is always possible to add a single page to an empty bio.
*
* This should only be used by REQ_PC bios.
*/
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
		    *page, unsigned int len, unsigned int offset)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
		return 0;

	/*
	 * For filesystems with a blocksize smaller than the pagesize
	 * we will often be called with the same page as last time and
	 * a consecutive offset.  Optimize this special case.
	 */
	if (bio->bi_vcnt > 0) {
		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == prev->bv_page &&
		    offset == prev->bv_offset + prev->bv_len) {
			prev->bv_len += len;
			bio->bi_iter.bi_size += len;
			goto done;
		}

		/*
		 * If the queue doesn't support SG gaps and adding this
		 * offset would create a gap, disallow it.
		 */
		if (bvec_gap_to_prev(q, prev, offset))
			return 0;
	}

	if (bio_full(bio))
		return 0;

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;
	bio->bi_vcnt++;
	bio->bi_phys_segments++;
	bio->bi_iter.bi_size += len;

	/*
	 * Perform a recount if the number of segments is greater
	 * than queue_max_segments(q).
	 */

	while (bio->bi_phys_segments > queue_max_segments(q)) {

		if (retried_segments)
			goto failed;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
		bio_clear_flag(bio, BIO_SEG_VALID);

 done:
	return len;

 failed:
	bvec->bv_page = NULL;
	bvec->bv_len = 0;
	bvec->bv_offset = 0;
	bio->bi_vcnt--;
	bio->bi_iter.bi_size -= len;
	blk_recount_segments(q, bio);
	return 0;
}
EXPORT_SYMBOL(bio_add_pc_page);
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: page to add
* @len: length of the data to add
* @off: offset of the data in @page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* a useful optimisation for file systems with a block size smaller than the
* page size.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off)
{
	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return false;

	if (bio->bi_vcnt > 0) {
		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
			bv->bv_len += len;
			bio->bi_iter.bi_size += len;
			return true;
		}
	}
	return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
/**
* __bio_add_page - add page to a bio in a new segment
* @bio: destination bio
* @page: page to add
* @len: length of the data to add
* @off: offset of the data in @page
*
* Add the data at @page + @off to @bio as a new bvec. The caller must ensure
* that @bio has space for another bvec.
*/
void __bio_add_page(struct bio *bio, struct page *page,
		unsigned int len, unsigned int off)
{
	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];

	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
	WARN_ON_ONCE(bio_full(bio));

	bv->bv_page = page;
	bv->bv_offset = off;
	bv->bv_len = len;

	bio->bi_iter.bi_size += len;
	bio->bi_vcnt++;
}
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
* bio_add_page - attempt to add page to bio
* @bio: destination bio
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist. This will only fail
* if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
*/
int bio_add_page(struct bio *bio, struct page *page,
		 unsigned int len, unsigned int offset)
{
	if (!__bio_try_merge_page(bio, page, len, offset)) {
		if (bio_full(bio))
			return 0;
		__bio_add_page(bio, page, len, offset);
	}
	return len;
}
EXPORT_SYMBOL(bio_add_page);
/**
* __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
* Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
* For multi-segment *iter, this function only adds pages from the
* the next non-empty segment of the iov iterator.
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	struct page **pages = (struct page **)bv;
	size_t offset;
	ssize_t size;

	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	pr_debug("size = %d", size);
	if (unlikely(size <= 0)) {
		pr_debug("size = %d <0, return -EFAULT", size);
		return size ? size : -EFAULT;
	}
	idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;

	/*
	 * Deep magic below:  We need to walk the pinned pages backwards
	 * because we are abusing the space allocated for the bio_vecs
	 * for the page array.  Because the bio_vecs are larger than the
	 * page pointers by definition this will always work.  But it also
	 * means we can't use bio_add_page, so any changes to it's semantics
	 * need to be reflected here as well.
	 */
	bio->bi_iter.bi_size += size;
	bio->bi_vcnt += nr_pages;

	while (idx--) {
		bv[idx].bv_page = pages[idx];
		bv[idx].bv_len = PAGE_SIZE;
		bv[idx].bv_offset = 0;
	}

	bv[0].bv_offset += offset;
	bv[0].bv_len -= offset;
	bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;

	iov_iter_advance(iter, size);
	return 0;
}
/**
* bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
* Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
* The function tries, but does not guarantee, to pin as many pages as
* fit into the bio, or are requested in *iter, whatever is smaller.
* If MM encounters an error pinning the requested pages, it stops.
* Error is returned only if 0 pages could be pinned.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	unsigned short orig_vcnt = bio->bi_vcnt;
	pr_debug("ret = %d", orig_vcnt);
	do {
		int ret = __bio_iov_iter_get_pages(bio, iter);

		if (unlikely(ret))
			return bio->bi_vcnt > orig_vcnt ? 0 : ret;

	} while (iov_iter_count(iter) && !bio_full(bio));

	return 0;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
static void submit_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}
/**
* submit_bio_wait - submit a bio, and wait until it completes
* @bio: The &struct bio which describes the I/O
*
* Simple wrapper around submit_bio(). Returns 0 on success, or the error from
* bio_endio() on failure.
*
* WARNING: Unlike to how submit_bio() is usually used, this function does not
* result in bio reference to be consumed. The caller must drop the reference
* on his own.
*/
int submit_bio_wait(struct bio *bio)
{
	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);

	bio->bi_private = &done;
	bio->bi_end_io = submit_bio_wait_endio;
	bio->bi_opf |= REQ_SYNC;
	submit_bio(bio);
	wait_for_completion_io(&done);

	return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
/**
* bio_advance - increment/complete a bio by some number of bytes
* @bio: bio to advance
* @bytes: number of bytes to complete
*
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
* be updated on the last bvec as well.
*
* @bio will then represent the remaining, uncompleted portion of the io.
*/
void bio_advance(struct bio *bio, unsigned bytes)
{
	if (bio_integrity(bio))
		bio_integrity_advance(bio, bytes);

	bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
			struct bio *src, struct bvec_iter *src_iter)
{
	struct bio_vec src_bv, dst_bv;
	void *src_p, *dst_p;
	unsigned bytes;

	while (src_iter->bi_size && dst_iter->bi_size) {
		src_bv = bio_iter_iovec(src, *src_iter);
		dst_bv = bio_iter_iovec(dst, *dst_iter);

		bytes = min(src_bv.bv_len, dst_bv.bv_len);

		src_p = kmap_atomic(src_bv.bv_page);
		dst_p = kmap_atomic(dst_bv.bv_page);

		memcpy(dst_p + dst_bv.bv_offset,
		       src_p + src_bv.bv_offset,
		       bytes);

		kunmap_atomic(dst_p);
		kunmap_atomic(src_p);

		flush_dcache_page(dst_bv.bv_page);

		bio_advance_iter(src, src_iter, bytes);
		bio_advance_iter(dst, dst_iter, bytes);
	}
}
EXPORT_SYMBOL(bio_copy_data_iter);
/**
* bio_copy_data - copy contents of data buffers from one bio to another
* @src: source bio
* @dst: destination bio
*
* Stops when it reaches the end of either @src or @dst - that is, copies
* min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
*/
void bio_copy_data(struct bio *dst, struct bio *src)
{
	struct bvec_iter src_iter = src->bi_iter;
	struct bvec_iter dst_iter = dst->bi_iter;

	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
EXPORT_SYMBOL(bio_copy_data);
/**
* bio_list_copy_data - copy contents of data buffers from one chain of bios to
* another
* @src: source bio list
* @dst: destination bio list
*
* Stops when it reaches the end of either the @src list or @dst list - that is,
* copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
* bios).
*/
void bio_list_copy_data(struct bio *dst, struct bio *src)
{
	struct bvec_iter src_iter = src->bi_iter;
	struct bvec_iter dst_iter = dst->bi_iter;

	while (1) {
		if (!src_iter.bi_size) {
			src = src->bi_next;
			if (!src)
				break;

			src_iter = src->bi_iter;
		}

		if (!dst_iter.bi_size) {
			dst = dst->bi_next;
			if (!dst)
				break;

			dst_iter = dst->bi_iter;
		}

		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
	}
}
EXPORT_SYMBOL(bio_list_copy_data);
struct bio_map_data {
	int is_our_pages;
	struct iov_iter iter;
	struct iovec iov[];
};
static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
					       gfp_t gfp_mask)
{
	struct bio_map_data *bmd;
	if (data->nr_segs > UIO_MAXIOV)
		return NULL;

	bmd = kmalloc(sizeof(struct bio_map_data) +
		       sizeof(struct iovec) * data->nr_segs, gfp_mask);
	if (!bmd)
		return NULL;
	memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
	bmd->iter = *data;
	bmd->iter.iov = bmd->iov;
	return bmd;
}
/**
* bio_copy_from_iter - copy all pages from iov_iter to bio
* @bio: The &struct bio which describes the I/O as destination
* @iter: iov_iter as source
*
* Copy all pages from iov_iter to bio.
* Returns 0 on success, or error on failure.
*/
static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	int i;
	struct bio_vec *bvec;

	bio_for_each_segment_all(bvec, bio, i) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec->bv_page,
					  bvec->bv_offset,
					  bvec->bv_len,
					  iter);

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec->bv_len)
			return -EFAULT;
	}

	return 0;
}
/**
* bio_copy_to_iter - copy all pages from bio to iov_iter
* @bio: The &struct bio which describes the I/O as source
* @iter: iov_iter as destination
*
* Copy all pages from bio to iov_iter.
* Returns 0 on success, or error on failure.
*/
static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
{
	int i;
	struct bio_vec *bvec;

	bio_for_each_segment_all(bvec, bio, i) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec->bv_page,
					bvec->bv_offset,
					bvec->bv_len,
					&iter);

		if (!iov_iter_count(&iter))
			break;

		if (ret < bvec->bv_len)
			return -EFAULT;
	}

	return 0;
}
void bio_free_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i)
		__free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);
/**
* bio_uncopy_user - finish previously mapped bio
* @bio: bio being terminated
*
* Free pages allocated from bio_copy_user_iov() and write back data
* to user space in case of a read.
*/
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	int ret = 0;

	if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
		/*
		 * if we're in a workqueue, the request is orphaned, so
		 * don't copy into a random user address space, just free
		 * and return -EINTR so user space doesn't expect any data.
		 */
		if (!current->mm)
			ret = -EINTR;
		else if (bio_data_dir(bio) == READ)
			ret = bio_copy_to_iter(bio, bmd->iter);
		if (bmd->is_our_pages)
			bio_free_pages(bio);
	}
	kfree(bmd);
	bio_put(bio);
	return ret;
}
/**
* bio_copy_user_iov - copy user data to bio
* @q: destination block queue
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iter: iovec iterator
* @gfp_mask: memory allocation flags
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
* call bio_uncopy_user() on io completion.
*/
struct bio *bio_copy_user_iov(struct request_queue *q,
			      struct rq_map_data *map_data,
			      struct iov_iter *iter,
			      gfp_t gfp_mask)
{
	struct bio_map_data *bmd;
	struct page *page;
	struct bio *bio;
	int i = 0, ret;
	int nr_pages;
	unsigned int len = iter->count;
	unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;

	bmd = bio_alloc_map_data(iter, gfp_mask);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	/*
	 * We need to do a deep copy of the iov_iter including the iovecs.
	 * The caller provided iov might point to an on-stack or otherwise
	 * shortlived one.
	 */
	bmd->is_our_pages = map_data ? 0 : 1;

	nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	if (nr_pages > BIO_MAX_PAGES)
		nr_pages = BIO_MAX_PAGES;

	ret = -ENOMEM;
	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		goto out_bmd;

	ret = 0;

	if (map_data) {
		nr_pages = 1 << map_data->page_order;
		i = map_data->offset / PAGE_SIZE;
	}
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		bytes -= offset;

		if (bytes > len)
			bytes = len;

		if (map_data) {
			if (i == map_data->nr_entries * nr_pages) {
				ret = -ENOMEM;
				break;
			}

			page = map_data->pages[i / nr_pages];
			page += (i % nr_pages);

			i++;
		} else {
			page = alloc_page(q->bounce_gfp | gfp_mask);
			if (!page) {
				ret = -ENOMEM;
				break;
			}
		}

		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
			break;

		len -= bytes;
		offset = 0;
	}

	if (ret)
		goto cleanup;

	if (map_data)
		map_data->offset += bio->bi_iter.bi_size;

	/*
	 * success
	 */
	if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) ||
	    (map_data && map_data->from_user)) {
		ret = bio_copy_from_iter(bio, iter);
		if (ret)
			goto cleanup;
	} else {
		iov_iter_advance(iter, bio->bi_iter.bi_size);
	}

	bio->bi_private = bmd;
	if (map_data && map_data->null_mapped)
		bio_set_flag(bio, BIO_NULL_MAPPED);
	return bio;
cleanup:
	if (!map_data)
		bio_free_pages(bio);
	bio_put(bio);
out_bmd:
	kfree(bmd);
	return ERR_PTR(ret);
}
/**
* bio_map_user_iov - map user iovec into bio
* @q: the struct request_queue for the bio
* @iter: iovec iterator
* @gfp_mask: memory allocation flags
*
* Map the user space address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_user_iov(struct request_queue *q,
			     struct iov_iter *iter,
			     gfp_t gfp_mask)
{
	int j;
	struct bio *bio;
	int ret;
	struct bio_vec *bvec;

	if (!iov_iter_count(iter))
		return ERR_PTR(-EINVAL);

	bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
	if (!bio)
		return ERR_PTR(-ENOMEM);

	while (iov_iter_count(iter)) {
		struct page **pages;
		ssize_t bytes;
		size_t offs, added = 0;
		int npages;

		bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
		if (unlikely(bytes <= 0)) {
			ret = bytes ? bytes : -EFAULT;
			goto out_unmap;
		}

		npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);

		if (unlikely(offs & queue_dma_alignment(q))) {
			ret = -EINVAL;
			j = 0;
		} else {
			for (j = 0; j < npages; j++) {
				struct page *page = pages[j];
				unsigned int n = PAGE_SIZE - offs;
				unsigned short prev_bi_vcnt = bio->bi_vcnt;

				if (n > bytes)
					n = bytes;

				if (!bio_add_pc_page(q, bio, page, n, offs))
					break;

				/*
				 * check if vector was merged with previous
				 * drop page reference if needed
				 */
				if (bio->bi_vcnt == prev_bi_vcnt)
					put_page(page);

				added += n;
				bytes -= n;
				offs = 0;
			}
			iov_iter_advance(iter, added);
		}
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < npages)
			put_page(pages[j++]);
		kvfree(pages);
		/* couldn't stuff something into bio? */
		if (bytes)
			break;
	}

	bio_set_flag(bio, BIO_USER_MAPPED);

	/*
	 * subtle -- if bio_map_user_iov() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);
	return bio;

 out_unmap:
	bio_for_each_segment_all(bvec, bio, j) {
		put_page(bvec->bv_page);
	}
	bio_put(bio);
	return ERR_PTR(ret);
}
static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	bio_for_each_segment_all(bvec, bio, i) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		put_page(bvec->bv_page);
	}

	bio_put(bio);
}
/**
* bio_unmap_user - unmap a bio
* @bio: the bio being unmapped
*
* Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
* process context.
*
* bio_unmap_user() may sleep.
*/
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}

static void bio_map_kern_endio(struct bio *bio)
{
	bio_put(bio);
}
/**
* bio_map_kern - map kernel address into bio
* @q: the struct request_queue for the bio
* @data: pointer to buffer to map
* @len: length in bytes
* @gfp_mask: allocation flags for bio allocation
*
* Map the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
			 gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes) {
			/* we don't support partial mappings */
			bio_put(bio);
			return ERR_PTR(-EINVAL);
		}

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}
EXPORT_SYMBOL(bio_map_kern);
static void bio_copy_kern_endio(struct bio *bio)
{
	bio_free_pages(bio);
	bio_put(bio);
}

static void bio_copy_kern_endio_read(struct bio *bio)
{
	char *p = bio->bi_private;
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
		p += bvec->bv_len;
	}

	bio_copy_kern_endio(bio);
}
/**
* bio_copy_kern - copy kernel address into bio
* @q: the struct request_queue for the bio
* @data: pointer to buffer to copy
* @len: length in bytes
* @gfp_mask: allocation flags for bio and page allocation
* @reading: data direction is READ
*
* copy the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
			  gfp_t gfp_mask, int reading)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	struct bio *bio;
	void *p = data;
	int nr_pages = 0;

	/*
	 * Overflow, abort
	 */
	if (end < start)
		return ERR_PTR(-EINVAL);

	nr_pages = end - start;
	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	while (len) {
		struct page *page;
		unsigned int bytes = PAGE_SIZE;

		if (bytes > len)
			bytes = len;

		page = alloc_page(q->bounce_gfp | gfp_mask);
		if (!page)
			goto cleanup;

		if (!reading)
			memcpy(page_address(page), p, bytes);

		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
			break;

		len -= bytes;
		p += bytes;
	}

	if (reading) {
		bio->bi_end_io = bio_copy_kern_endio_read;
		bio->bi_private = data;
	} else {
		bio->bi_end_io = bio_copy_kern_endio;
	}

	return bio;

cleanup:
	bio_free_pages(bio);
	bio_put(bio);
	return ERR_PTR(-ENOMEM);
}
/*
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
* for performing direct-IO in BIOs.
*
* The problem is that we cannot run set_page_dirty() from interrupt context
* because the required locks are not interrupt-safe. So what we can do is to
* mark the pages dirty _before_ performing IO. And in interrupt context,
* check that the pages are still dirty. If so, fine. If not, redirty them
* in process context.
*
* We special-case compound pages here: normally this means reads into hugetlb
* pages. The logic in here doesn't really work right for compound pages
* because the VM does not uniformly chase down the head page in all cases.
* But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
* handle them at all. So we skip compound pages here at an early stage.
*
* Note that this code is very hard to test under normal circumstances because
* direct-io pins the pages with get_user_pages(). This makes
* is_page_cache_freeable return false, and the VM will not clean the pages.
* But other code (eg, flusher threads) could clean the pages if they are mapped
* pagecache.
*
* Simply disabling the call to bio_set_pages_dirty() is a good way to test the
* deferred bio dirtying paths.
*/
/*
* bio_set_pages_dirty() will mark all the bio's pages as dirty.
*/
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		if (!PageCompound(bvec->bv_page))
			set_page_dirty_lock(bvec->bv_page);
	}
}
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
static void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i)
		put_page(bvec->bv_page);
}
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
* If they are, then fine. If, however, some pages are clean then they must
* have been written out during the direct-IO read. So we take another ref on
* the BIO and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
* here on. It will run one put_page() against each page and will run one
* bio_put() against the BIO.
*/
static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;
/*
* This runs in process context
*/
static void bio_dirty_fn(struct work_struct *work)
{
	struct bio *bio, *next;

	spin_lock_irq(&bio_dirty_lock);
	next = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irq(&bio_dirty_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
	}
}
void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec;
	unsigned long flags;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
			goto defer;
	}

	bio_release_pages(bio);
	bio_put(bio);
	return;
defer:
	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio->bi_private = bio_dirty_list;
	bio_dirty_list = bio;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);
	schedule_work(&bio_dirty_work);
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
void generic_start_io_acct(struct request_queue *q, int op,
			   unsigned long sectors, struct hd_struct *part)
{
	const int sgrp = op_stat_group(op);
	int cpu = part_stat_lock();

	part_round_stats(q, cpu, part);
	part_stat_inc(cpu, part, ios[sgrp]);
	part_stat_add(cpu, part, sectors[sgrp], sectors);
	part_inc_in_flight(q, part, op_is_write(op));

	part_stat_unlock();
}
EXPORT_SYMBOL(generic_start_io_acct);
void generic_end_io_acct(struct request_queue *q, int req_op,
			 struct hd_struct *part, unsigned long start_time)
{
	unsigned long duration = jiffies - start_time;
	const int sgrp = op_stat_group(req_op);
	int cpu = part_stat_lock();

	part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
	part_round_stats(q, cpu, part);
	part_dec_in_flight(q, part, op_is_write(req_op));

	part_stat_unlock();
}
EXPORT_SYMBOL(generic_end_io_acct);
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void bio_flush_dcache_pages(struct bio *bi)
{
	struct bio_vec bvec;
	struct bvec_iter iter;

	bio_for_each_segment(bvec, bi, iter)
		flush_dcache_page(bvec.bv_page);
}
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
static inline bool bio_remaining_done(struct bio *bio)
{
	/*
	 * If we're not chaining, then ->__bi_remaining is always 1 and
	 * we always end io on the first invocation.
	 */
	if (!bio_flagged(bio, BIO_CHAIN))
		return true;

	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);

	if (atomic_dec_and_test(&bio->__bi_remaining)) {
		bio_clear_flag(bio, BIO_CHAIN);
		return true;
	}

	return false;
}
/**
* bio_endio - end I/O on a bio
* @bio: bio
*
* Description:
* bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
* way to end I/O on a bio. No one should call bi_end_io() directly on a
* bio unless they own it and thus know that it has an end_io function.
*
* bio_endio() can be called several times on a bio that has been chained
* using bio_chain(). The ->bi_end_io() function will only be called the
* last time. At this point the BLK_TA_COMPLETE tracing event will be
* generated if BIO_TRACE_COMPLETION is set.
**/
void bio_endio(struct bio *bio)
{
again:
	if (!bio_remaining_done(bio))
		return;
	if (!bio_integrity_endio(bio))
		return;

	if (bio->bi_disk)
		rq_qos_done_bio(bio->bi_disk->queue, bio);

	/*
	 * Need to have a real endio function for chained bios, otherwise
	 * various corner cases will break (like stacking block devices that
	 * save/restore bi_end_io) - however, we want to avoid unbounded
	 * recursion and blowing the stack.  Tail call optimization would
	 * handle this, but compiling with frame pointers also disables
	 * gcc's sibling call optimization.
	 */
	if (bio->bi_end_io == bio_chain_endio) {
		bio = __bio_chain_endio(bio);
		goto again;
	}

	if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
		trace_block_bio_complete(bio->bi_disk->queue, bio,
					 blk_status_to_errno(bio->bi_status));
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
	}

	blk_throtl_bio_endio(bio);
	/* release cgroup info */
	bio_uninit(bio);
	if (bio->bi_end_io)
		bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
/**
* bio_split - split a bio
* @bio: bio to split
* @sectors: number of sectors to split from the front of @bio
* @gfp: gfp mask
* @bs: bio set to allocate from
*
* Allocates and returns a new bio which represents @sectors from the start of
* @bio, and updates @bio to represent the remaining sectors.
*
* Unless this is a discard request the newly allocated bio will point
* to @bio's bi_io_vec; it is the caller's responsibility to ensure that
* @bio is not freed before the split.
*/
struct bio *bio_split(struct bio *bio, int sectors,
		      gfp_t gfp, struct bio_set *bs)
{
	struct bio *split;

	BUG_ON(sectors <= 0);
	BUG_ON(sectors >= bio_sectors(bio));

	split = bio_clone_fast(bio, gfp, bs);
	if (!split)
		return NULL;

	split->bi_iter.bi_size = sectors << 9;

	if (bio_integrity(split))
		bio_integrity_trim(split);

	bio_advance(bio, split->bi_iter.bi_size);
	bio->bi_iter.bi_done = 0;

	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
		bio_set_flag(split, BIO_TRACE_COMPLETION);

	return split;
}
EXPORT_SYMBOL(bio_split);
/**
* bio_trim - trim a bio
* @bio: bio to trim
* @offset: number of sectors to trim from the front of @bio
* @size: size we want to trim @bio to, in sectors
*/
void bio_trim(struct bio *bio, int offset, int size)
{
	/* 'bio' is a cloned bio which we need to trim to match
	 * the given offset and size.
	 */

	size <<= 9;
	if (offset == 0 && size == bio->bi_iter.bi_size)
		return;

	bio_clear_flag(bio, BIO_SEG_VALID);

	bio_advance(bio, offset << 9);
	bio->bi_iter.bi_size = size;

	if (bio_integrity(bio))
		bio_integrity_trim(bio);
}
EXPORT_SYMBOL_GPL(bio_trim);
/*
* create memory pools for biovec's in a bio_set.
* use the global biovec slabs created for general use.
*/
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
	struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;

	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
}
/*
* bioset_exit - exit a bioset initialized with bioset_init()
*
* May be called on a zeroed but uninitialized bioset (i.e. allocated with
* kzalloc()).
*/
void bioset_exit(struct bio_set *bs)
{
	if (bs->rescue_workqueue)
		destroy_workqueue(bs->rescue_workqueue);
	bs->rescue_workqueue = NULL;

	mempool_exit(&bs->bio_pool);
	mempool_exit(&bs->bvec_pool);

	bioset_integrity_free(bs);
	if (bs->bio_slab)
		bio_put_slab(bs);
	bs->bio_slab = NULL;
}
EXPORT_SYMBOL(bioset_exit);
/**
* bioset_init - Initialize a bio_set
* @bs: pool to initialize
* @pool_size: Number of bio and bio_vecs to cache in the mempool
* @front_pad: Number of bytes to allocate in front of the returned bio
* @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
* and %BIOSET_NEED_RESCUER
*
* Description:
* Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
* to ask for a number of bytes to be allocated in front of the bio.
* Front pad allocation is useful for embedding the bio inside
* another structure, to avoid allocating extra data to go with the bio.
* Note that the bio must be embedded at the END of that structure always,
* or things will break badly.
* If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
* for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
* If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
* dispatch queued requests when the mempool runs out of space.
*
*/
int bioset_init(struct bio_set *bs,
		unsigned int pool_size,
		unsigned int front_pad,
		int flags)
{
	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);

	bs->front_pad = front_pad;

	spin_lock_init(&bs->rescue_lock);
	bio_list_init(&bs->rescue_list);
	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);

	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
	if (!bs->bio_slab)
		return -ENOMEM;

	if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
		goto bad;

	if ((flags & BIOSET_NEED_BVECS) &&
	    biovec_init_pool(&bs->bvec_pool, pool_size))
		goto bad;

	if (!(flags & BIOSET_NEED_RESCUER))
		return 0;

	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
	if (!bs->rescue_workqueue)
		goto bad;

	return 0;
bad:
	bioset_exit(bs);
	return -ENOMEM;
}
EXPORT_SYMBOL(bioset_init);
/*
* Initialize and setup a new bio_set, based on the settings from
* another bio_set.
*/
int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
{
	int flags;

	flags = 0;
	if (src->bvec_pool.min_nr)
		flags |= BIOSET_NEED_BVECS;
	if (src->rescue_workqueue)
		flags |= BIOSET_NEED_RESCUER;

	return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
}
EXPORT_SYMBOL(bioset_init_from_src);
#ifdef CONFIG_BLK_CGROUP
#ifdef CONFIG_MEMCG
/**
* bio_associate_blkcg_from_page - associate a bio with the page's blkcg
* @bio: target bio
* @page: the page to lookup the blkcg from
*
* Associate @bio with the blkcg from @page's owning memcg. This works like
* every other associate function wrt references.
*/
int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
{
	struct cgroup_subsys_state *blkcg_css;

	if (unlikely(bio->bi_css))
		return -EBUSY;
	if (!page->mem_cgroup)
		return 0;
	blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
				     &io_cgrp_subsys);
	bio->bi_css = blkcg_css;
	return 0;
}
#endif /* CONFIG_MEMCG */
/**
* bio_associate_blkcg - associate a bio with the specified blkcg
* @bio: target bio
* @blkcg_css: css of the blkcg to associate
*
* Associate @bio with the blkcg specified by @blkcg_css. Block layer will
* treat @bio as if it were issued by a task which belongs to the blkcg.
*
* This function takes an extra reference of @blkcg_css which will be put
* when @bio is released. The caller must own @bio and is responsible for
* synchronizing calls to this function.
*/
int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
{
	if (unlikely(bio->bi_css))
		return -EBUSY;
	css_get(blkcg_css);
	bio->bi_css = blkcg_css;
	return 0;
}
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
* bio_associate_blkg - associate a bio with the specified blkg
* @bio: target bio
* @blkg: the blkg to associate
*
* Associate @bio with the blkg specified by @blkg. This is the queue specific
* blkcg information associated with the @bio, a reference will be taken on the
* @blkg and will be freed when the bio is freed.
*/
int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
{
	if (unlikely(bio->bi_blkg))
		return -EBUSY;
	if (!blkg_try_get(blkg))
		return -ENODEV;
	bio->bi_blkg = blkg;
	return 0;
}
/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
*/
void bio_disassociate_task(struct bio *bio)
{
	if (bio->bi_ioc) {
		put_io_context(bio->bi_ioc);
		bio->bi_ioc = NULL;
	}
	if (bio->bi_css) {
		css_put(bio->bi_css);
		bio->bi_css = NULL;
	}
	if (bio->bi_blkg) {
		blkg_put(bio->bi_blkg);
		bio->bi_blkg = NULL;
	}
}
/**
* bio_clone_blkcg_association - clone blkcg association from src to dst bio
* @dst: destination bio
* @src: source bio
*/
void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_css)
		WARN_ON(bio_associate_blkcg(dst, src->bi_css));
}
EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
#endif /* CONFIG_BLK_CGROUP */
static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BVEC_POOL_NR; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
			bvs->slab = NULL;
			continue;
		}

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
					      SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
	}
}
static int __init init_bio(void)
{
	bio_slab_max = 2;
	bio_slab_nr = 0;
	bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
			    GFP_KERNEL);
	if (!bio_slabs)
		panic("bio: can't allocate bios\n");

	bio_integrity_init();
	biovec_init_slabs();

	if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
		panic("bio: can't allocate bios\n");

	if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
		panic("bio: can't create integrity pool\n");

	return 0;
}
subsys_initcall(init_bio);
src/drivers/block/loop.c — new file (0 → 100644)
/*
* linux/drivers/block/loop.c
*
* Written by Theodore Ts'o, 3/29/93
*
* Copyright 1993 by Theodore Ts'o. Redistribution of this file is
* permitted under the GNU General Public License.
*
* DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
* more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
*
* Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
* Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
*
* Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
*
* Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
*
* Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
*
* Loadable modules and other fixes by AK, 1998
*
* Make real block number available to downstream transfer functions, enables
* CBC (and relatives) mode encryption requiring unique IVs per data block.
* Reed H. Petty, rhp@draper.net
*
* Maximum number of loop devices now dynamic via max_loop module parameter.
* Russell Kroll <rkroll@exploits.org> 19990701
*
* Maximum number of loop devices when compiled-in now selectable by passing
* max_loop=<1-255> to the kernel on boot.
* Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
*
* Completely rewrite request handling to be make_request_fn style and
* non blocking, pushing work to a helper thread. Lots of fixes from
* Al Viro too.
* Jens Axboe <axboe@suse.de>, Nov 2000
*
* Support up to 256 loop devices
* Heinz Mauelshagen <mge@sistina.com>, Feb 2002
*
* Support for falling back on the write file operation when the address space
* operations write_begin is not available on the backing filesystem.
* Anton Altaparmakov, 16 Feb 2005
*
* Still To Fix:
* - Advisory locking is ignored here.
* - Should use an own CAP_* category instead of CAP_SYS_ADMIN
*
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include "loop.h"
#include <linux/uaccess.h>
static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_index_mutex);

static int max_part;
static int part_shift;
static int transfer_xor(struct loop_device *lo, int cmd,
			struct page *raw_page, unsigned raw_off,
			struct page *loop_page, unsigned loop_off,
			int size, sector_t real_block)
{
	char *raw_buf = kmap_atomic(raw_page) + raw_off;
	char *loop_buf = kmap_atomic(loop_page) + loop_off;
	char *in, *out, *key;
	int i, keysize;

	if (cmd == READ) {
		in = raw_buf;
		out = loop_buf;
	} else {
		in = loop_buf;
		out = raw_buf;
	}

	key = lo->lo_encrypt_key;
	keysize = lo->lo_encrypt_key_size;
	for (i = 0; i < size; i++)
		*out++ = *in++ ^ key[(i & 511) % keysize];

	kunmap_atomic(loop_buf);
	kunmap_atomic(raw_buf);
	cond_resched();
	return 0;
}
static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
{
	if (unlikely(info->lo_encrypt_key_size <= 0))
		return -EINVAL;
	return 0;
}

static struct loop_func_table none_funcs = {
	.number = LO_CRYPT_NONE,
};

static struct loop_func_table xor_funcs = {
	.number = LO_CRYPT_XOR,
	.transfer = transfer_xor,
	.init = xor_init
};

/* xfer_funcs[0] is special - its release function is never called */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
	&none_funcs,
	&xor_funcs
};
static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
{
	loff_t loopsize;

	/* Compute loopsize in bytes */
	loopsize = i_size_read(file->f_mapping->host);
	if (offset > 0)
		loopsize -= offset;
	/* offset is beyond i_size, weird but possible */
	if (loopsize < 0)
		return 0;

	if (sizelimit > 0 && sizelimit < loopsize)
		loopsize = sizelimit;
	/*
	 * Unfortunately, if we want to do I/O on the device,
	 * the number of 512-byte sectors has to fit into a sector_t.
	 */
	return loopsize >> 9;
}

static loff_t get_loop_size(struct loop_device *lo, struct file *file)
{
	return get_size(lo->lo_offset, lo->lo_sizelimit, file);
}
static void __loop_update_dio(struct loop_device *lo, bool dio)
{
	struct file *file = lo->lo_backing_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned short sb_bsize = 0;
	unsigned dio_align = 0;
	bool use_dio;

	if (inode->i_sb->s_bdev) {
		sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
		dio_align = sb_bsize - 1;
	}

	/*
	 * We support direct I/O only if lo_offset is aligned with the
	 * logical I/O size of backing device, and the logical block
	 * size of loop is bigger than the backing device's and the loop
	 * needn't transform transfer.
	 *
	 * TODO: the above condition may be loosed in the future, and
	 * direct I/O may be switched runtime at that time because most
	 * of requests in sane applications should be PAGE_SIZE aligned
	 */
	if (dio) {
		if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
		    !(lo->lo_offset & dio_align) &&
		    mapping->a_ops->direct_IO &&
		    !lo->transfer)
			use_dio = true;
		else
			use_dio = false;
	} else {
		use_dio = false;
	}

	if (lo->use_dio == use_dio)
		return;

	/* flush dirty pages before changing direct IO */
	vfs_fsync(file, 0);

	/*
	 * The flag of LO_FLAGS_DIRECT_IO is handled similarly with
	 * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
	 * will get updated by ioctl(LOOP_GET_STATUS)
	 */
	blk_mq_freeze_queue(lo->lo_queue);
	lo->use_dio = use_dio;
	if (use_dio) {
		blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue);
		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
	} else {
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
	}
	blk_mq_unfreeze_queue(lo->lo_queue);
}
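The dio gate above reduces to a power-of-two alignment test of lo_offset against the backing device's logical block size. A minimal sketch of that predicate (the helper name is hypothetical and not part of the driver):

/* Hypothetical illustration of the alignment check used in __loop_update_dio(). */
static inline bool lo_offset_is_dio_aligned(loff_t lo_offset,
					    unsigned short sb_bsize)
{
	/* sb_bsize is a power of two, so (sb_bsize - 1) masks the low-order
	 * bits; any non-zero remainder means the offset is unaligned. */
	return (lo_offset & (loff_t)(sb_bsize - 1)) == 0;
}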
static int figure_loop_size(struct loop_device *lo, loff_t offset,
			    loff_t sizelimit)
{
	loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
	sector_t x = (sector_t)size;
	struct block_device *bdev = lo->lo_device;

	if (unlikely((loff_t)x != size))
		return -EFBIG;
	if (lo->lo_offset != offset)
		lo->lo_offset = offset;
	if (lo->lo_sizelimit != sizelimit)
		lo->lo_sizelimit = sizelimit;

	set_capacity(lo->lo_disk, x);
	bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
	/* let user-space know about the new size */
	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
	return 0;
}
static inline int
lo_do_transfer(struct loop_device *lo, int cmd,
	       struct page *rpage, unsigned roffs,
	       struct page *lpage, unsigned loffs,
	       int size, sector_t rblock)
{
	int ret;

	ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
	if (likely(!ret))
		return 0;

	printk_ratelimited(KERN_ERR
		"loop: Transfer error at byte offset %llu, length %i.\n",
		(unsigned long long)rblock << 9, size);
	return ret;
}
static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
{
	struct iov_iter i;
	ssize_t bw;

	iov_iter_bvec(&i, ITER_BVEC | WRITE, bvec, 1, bvec->bv_len);

	file_start_write(file);
	bw = vfs_iter_write(file, &i, ppos, 0);
	file_end_write(file);

	if (likely(bw == bvec->bv_len))
		return 0;

	printk_ratelimited(KERN_ERR
		"loop: Write error at byte offset %llu, length %i.\n",
		(unsigned long long)*ppos, bvec->bv_len);
	if (bw >= 0)
		bw = -EIO;
	return bw;
}
static int lo_write_simple(struct loop_device *lo, struct request *rq,
		loff_t pos)
{
	struct bio_vec bvec;
	struct req_iterator iter;
	int ret = 0;

	rq_for_each_segment(bvec, rq, iter) {
		ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
		if (ret < 0)
			break;
		cond_resched();
	}

	return ret;
}
/*
* This is the slow, transforming version that needs to double buffer the
* data as it cannot do the transformations in place without having direct
* access to the destination pages of the backing file.
*/
static int lo_write_transfer(struct loop_device *lo, struct request *rq,
		loff_t pos)
{
	struct bio_vec bvec, b;
	struct req_iterator iter;
	struct page *page;
	int ret = 0;

	page = alloc_page(GFP_NOIO);
	if (unlikely(!page))
		return -ENOMEM;

	rq_for_each_segment(bvec, rq, iter) {
		ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
			bvec.bv_offset, bvec.bv_len, pos >> 9);
		if (unlikely(ret))
			break;

		b.bv_page = page;
		b.bv_offset = 0;
		b.bv_len = bvec.bv_len;
		ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
		if (ret < 0)
			break;
	}

	__free_page(page);
	return ret;
}
static int lo_read_simple(struct loop_device *lo, struct request *rq,
		loff_t pos)
{
	struct bio_vec bvec;
	struct req_iterator iter;
	struct iov_iter i;
	ssize_t len;

	rq_for_each_segment(bvec, rq, iter) {
		iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len);
		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
		if (len < 0)
			return len;

		flush_dcache_page(bvec.bv_page);

		if (len != bvec.bv_len) {
			struct bio *bio;

			__rq_for_each_bio(bio, rq)
				zero_fill_bio(bio);
			break;
		}
		cond_resched();
	}

	return 0;
}
static int lo_read_transfer(struct loop_device *lo, struct request *rq,
		loff_t pos)
{
	struct bio_vec bvec, b;
	struct req_iterator iter;
	struct iov_iter i;
	struct page *page;
	ssize_t len;
	int ret = 0;

	page = alloc_page(GFP_NOIO);
	if (unlikely(!page))
		return -ENOMEM;

	rq_for_each_segment(bvec, rq, iter) {
		loff_t offset = pos;

		b.bv_page = page;
		b.bv_offset = 0;
		b.bv_len = bvec.bv_len;

		iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len);
		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
		if (len < 0) {
			ret = len;
			goto out_free_page;
		}

		ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
			bvec.bv_offset, len, offset >> 9);
		if (ret)
			goto out_free_page;

		flush_dcache_page(bvec.bv_page);

		if (len != bvec.bv_len) {
			struct bio *bio;

			__rq_for_each_bio(bio, rq)
				zero_fill_bio(bio);
			break;
		}
	}

	ret = 0;
out_free_page:
	__free_page(page);
	return ret;
}
static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
{
	/*
	 * We use punch hole to reclaim the free space used by the
	 * image a.k.a. discard. However we do not support discard if
	 * encryption is enabled, because it may give an attacker
	 * useful information.
	 */
	struct file *file = lo->lo_backing_file;
	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
	int ret;

	if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
	if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
		ret = -EIO;
 out:
	return ret;
}
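The same FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE combination can be exercised from user space against a backing file, which is handy when checking whether the filesystem under the loop device supports hole punching at all. A minimal sketch, assuming a Linux userland with _GNU_SOURCE (the file name and offsets are hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical backing file: punch a 1 MiB hole at offset 4 MiB,
	 * keeping the apparent file size unchanged. */
	int fd = open("backing.img", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20))
		perror("fallocate");	/* EOPNOTSUPP => no discard support */
	close(fd);
	return 0;
}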
static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
	struct file *file = lo->lo_backing_file;
	int ret = vfs_fsync(file, 0);
	if (unlikely(ret && ret != -EINVAL))
		ret = -EIO;

	return ret;
}
static void lo_complete_rq(struct request *rq)
{
	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	blk_status_t ret = BLK_STS_OK;

	if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
	    req_op(rq) != REQ_OP_READ) {
		if (cmd->ret < 0)
			ret = BLK_STS_IOERR;
		goto end_io;
	}

	/*
	 * Short READ - if we got some data, advance our request and
	 * retry it. If we got no data, end the rest with EIO.
	 */
	if (cmd->ret) {
		blk_update_request(rq, BLK_STS_OK, cmd->ret);
		cmd->ret = 0;
		blk_mq_requeue_request(rq, true);
	} else {
		if (cmd->use_aio) {
			struct bio *bio = rq->bio;

			while (bio) {
				zero_fill_bio(bio);
				bio = bio->bi_next;
			}
		}
		ret = BLK_STS_IOERR;
end_io:
		blk_mq_end_request(rq, ret);
	}
}
static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	if (!atomic_dec_and_test(&cmd->ref))
		return;
	kfree(cmd->bvec);
	cmd->bvec = NULL;
	blk_mq_complete_request(rq);
}

static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);

	if (cmd->css)
		css_put(cmd->css);
	cmd->ret = ret;
	lo_rw_aio_do_completion(cmd);
}
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
		     loff_t pos, bool rw)
{
	struct iov_iter iter;
	struct bio_vec *bvec;
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct bio *bio = rq->bio;
	struct file *file = lo->lo_backing_file;
	unsigned int offset;
	int segments = 0;
	int ret;

	if (rq->bio != rq->biotail) {
		struct req_iterator iter;
		struct bio_vec tmp;

		__rq_for_each_bio(bio, rq)
			segments += bio_segments(bio);
		bvec = kmalloc_array(segments, sizeof(struct bio_vec),
				     GFP_NOIO);
		if (!bvec)
			return -EIO;
		cmd->bvec = bvec;

		/*
		 * The bios of the request may be started from the middle of
		 * the 'bvec' because of bio splitting, so we can't directly
		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
		 * API will take care of all details for us.
		 */
		rq_for_each_segment(tmp, rq, iter) {
			*bvec = tmp;
			bvec++;
		}
		bvec = cmd->bvec;
		offset = 0;
	} else {
		/*
		 * Same here, this bio may be started from the middle of the
		 * 'bvec' because of bio splitting, so offset from the bvec
		 * must be passed to iov iterator
		 */
		offset = bio->bi_iter.bi_bvec_done;
		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
		segments = bio_segments(bio);
	}
	atomic_set(&cmd->ref, 2);

	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, segments, blk_rq_bytes(rq));
	iter.iov_offset = offset;

	cmd->iocb.ki_pos = pos;
	cmd->iocb.ki_filp = file;
	cmd->iocb.ki_complete = lo_rw_aio_complete;
	cmd->iocb.ki_flags = IOCB_DIRECT;
	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
	if (cmd->css)
		kthread_associate_blkcg(cmd->css);

	if (rw == WRITE)
		ret = call_write_iter(file, &cmd->iocb, &iter);
	else
		ret = call_read_iter(file, &cmd->iocb, &iter);

	lo_rw_aio_do_completion(cmd);
	kthread_associate_blkcg(NULL);

	if (ret != -EIOCBQUEUED)
		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
	return 0;
}
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;

	/*
	 * lo_write_simple and lo_read_simple should have been covered
	 * by io submit style function like lo_rw_aio(), one blocker
	 * is that lo_read_simple() need to call flush_dcache_page after
	 * the page is written from kernel, and it isn't easy to handle
	 * this in io submit style function which submits all segments
	 * of the req at one time. And direct read IO doesn't need to
	 * run flush_dcache_page().
	 */
	switch (req_op(rq)) {
	case REQ_OP_FLUSH:
		return lo_req_flush(lo, rq);
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		return lo_discard(lo, rq, pos);
	case REQ_OP_WRITE:
		if (lo->transfer)
			return lo_write_transfer(lo, rq, pos);
		else if (cmd->use_aio)
			return lo_rw_aio(lo, cmd, pos, WRITE);
		else
			return lo_write_simple(lo, rq, pos);
	case REQ_OP_READ:
		if (lo->transfer)
			return lo_read_transfer(lo, rq, pos);
		else if (cmd->use_aio)
			return lo_rw_aio(lo, cmd, pos, READ);
		else
			return lo_read_simple(lo, rq, pos);
	default:
		WARN_ON_ONCE(1);
		return -EIO;
		break;
	}
}
static inline void loop_update_dio(struct loop_device *lo)
{
	__loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
			lo->use_dio);
}
static void loop_reread_partitions(struct loop_device *lo,
				   struct block_device *bdev)
{
	int rc;

	/*
	 * bd_mutex has been held already in release path, so don't
	 * acquire it if this function is called in such case.
	 *
	 * If the reread partition isn't from release path, lo_refcnt
	 * must be at least one and it can only become zero when the
	 * current holder is released.
	 */
	if (!atomic_read(&lo->lo_refcnt))
		rc = __blkdev_reread_part(bdev);
	else
		rc = blkdev_reread_part(bdev);
	if (rc)
		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
			__func__, lo->lo_number, lo->lo_file_name, rc);
}
static inline int is_loop_device(struct file *file)
{
	struct inode *i = file->f_mapping->host;

	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}
static int loop_validate_file(struct file *file, struct block_device *bdev)
{
	struct inode *inode = file->f_mapping->host;
	struct file *f = file;

	/* Avoid recursion */
	while (is_loop_device(f)) {
		struct loop_device *l;

		if (f->f_mapping->host->i_bdev == bdev)
			return -EBADF;

		l = f->f_mapping->host->i_bdev->bd_disk->private_data;
		if (l->lo_state == Lo_unbound) {
			return -EINVAL;
		}
		f = l->lo_backing_file;
	}
	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
		return -EINVAL;
	return 0;
}
/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
* the original file and in High Availability environments to switch to
* an alternative location for the content in case of server meltdown.
* This can only work if the loop device is used read-only, and if the
* new backing store is the same size and type as the old backing store.
*/
static
int
loop_change_fd
(
struct
loop_device
*
lo
,
struct
block_device
*
bdev
,
unsigned
int
arg
)
{
struct
file
*
file
,
*
old_file
;
int
error
;
error
=
-
ENXIO
;
if
(
lo
->
lo_state
!=
Lo_bound
)
goto
out
;
/* the loop device has to be read-only */
error
=
-
EINVAL
;
if
(
!
(
lo
->
lo_flags
&
LO_FLAGS_READ_ONLY
))
goto
out
;
error
=
-
EBADF
;
file
=
fget
(
arg
);
if
(
!
file
)
goto
out
;
error
=
loop_validate_file
(
file
,
bdev
);
if
(
error
)
goto
out_putf
;
old_file
=
lo
->
lo_backing_file
;
error
=
-
EINVAL
;
/* size of the new backing store needs to be the same */
if
(
get_loop_size
(
lo
,
file
)
!=
get_loop_size
(
lo
,
old_file
))
goto
out_putf
;
/* and ... switch */
blk_mq_freeze_queue
(
lo
->
lo_queue
);
mapping_set_gfp_mask
(
old_file
->
f_mapping
,
lo
->
old_gfp_mask
);
lo
->
lo_backing_file
=
file
;
lo
->
old_gfp_mask
=
mapping_gfp_mask
(
file
->
f_mapping
);
mapping_set_gfp_mask
(
file
->
f_mapping
,
lo
->
old_gfp_mask
&
~
(
__GFP_IO
|
__GFP_FS
));
loop_update_dio
(
lo
);
blk_mq_unfreeze_queue
(
lo
->
lo_queue
);
fput
(
old_file
);
if
(
lo
->
lo_flags
&
LO_FLAGS_PARTSCAN
)
loop_reread_partitions
(
lo
,
bdev
);
return
0
;
out_putf:
fput
(
file
);
out:
return
error
;
}
/* loop sysfs attributes */
static
ssize_t
loop_attr_show
(
struct
device
*
dev
,
char
*
page
,
ssize_t
(
*
callback
)(
struct
loop_device
*
,
char
*
))
{
struct
gendisk
*
disk
=
dev_to_disk
(
dev
);
struct
loop_device
*
lo
=
disk
->
private_data
;
return
callback
(
lo
,
page
);
}
#define LOOP_ATTR_RO(_name) \
static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
static ssize_t loop_attr_do_show_##_name(struct device *d, \
struct device_attribute *attr, char *b) \
{ \
return loop_attr_show(d, b, loop_attr_##_name##_show); \
} \
static struct device_attribute loop_attr_##_name = \
__ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);
static
ssize_t
loop_attr_backing_file_show
(
struct
loop_device
*
lo
,
char
*
buf
)
{
ssize_t
ret
;
char
*
p
=
NULL
;
spin_lock_irq
(
&
lo
->
lo_lock
);
if
(
lo
->
lo_backing_file
)
p
=
file_path
(
lo
->
lo_backing_file
,
buf
,
PAGE_SIZE
-
1
);
spin_unlock_irq
(
&
lo
->
lo_lock
);
if
(
IS_ERR_OR_NULL
(
p
))
ret
=
PTR_ERR
(
p
);
else
{
ret
=
strlen
(
p
);
memmove
(
buf
,
p
,
ret
);
buf
[
ret
++
]
=
'\n'
;
buf
[
ret
]
=
0
;
}
return
ret
;
}
static
ssize_t
loop_attr_offset_show
(
struct
loop_device
*
lo
,
char
*
buf
)
{
return
sprintf
(
buf
,
"%llu
\n
"
,
(
unsigned
long
long
)
lo
->
lo_offset
);
}
static
ssize_t
loop_attr_sizelimit_show
(
struct
loop_device
*
lo
,
char
*
buf
)
{
return
sprintf
(
buf
,
"%llu
\n
"
,
(
unsigned
long
long
)
lo
->
lo_sizelimit
);
}
static
ssize_t
loop_attr_autoclear_show
(
struct
loop_device
*
lo
,
char
*
buf
)
{
int
autoclear
=
(
lo
->
lo_flags
&
LO_FLAGS_AUTOCLEAR
);
return
sprintf
(
buf
,
"%s
\n
"
,
autoclear
?
"1"
:
"0"
);
}
static
ssize_t
loop_attr_partscan_show
(
struct
loop_device
*
lo
,
char
*
buf
)
{
int
partscan
=
(
lo
->
lo_flags
&
LO_FLAGS_PARTSCAN
);
return
sprintf
(
buf
,
"%s
\n
"
,
partscan
?
"1"
:
"0"
);
}
static
ssize_t
loop_attr_dio_show
(
struct
loop_device
*
lo
,
char
*
buf
)
{
int
dio
=
(
lo
->
lo_flags
&
LO_FLAGS_DIRECT_IO
);
return
sprintf
(
buf
,
"%s
\n
"
,
dio
?
"1"
:
"0"
);
}
LOOP_ATTR_RO
(
backing_file
);
LOOP_ATTR_RO
(
offset
);
LOOP_ATTR_RO
(
sizelimit
);
LOOP_ATTR_RO
(
autoclear
);
LOOP_ATTR_RO
(
partscan
);
LOOP_ATTR_RO
(
dio
);
static
struct
attribute
*
loop_attrs
[]
=
{
&
loop_attr_backing_file
.
attr
,
&
loop_attr_offset
.
attr
,
&
loop_attr_sizelimit
.
attr
,
&
loop_attr_autoclear
.
attr
,
&
loop_attr_partscan
.
attr
,
&
loop_attr_dio
.
attr
,
NULL
,
};
static
struct
attribute_group
loop_attribute_group
=
{
.
name
=
"loop"
,
.
attrs
=
loop_attrs
,
};
static
void
loop_sysfs_init
(
struct
loop_device
*
lo
)
{
lo
->
sysfs_inited
=
!
sysfs_create_group
(
&
disk_to_dev
(
lo
->
lo_disk
)
->
kobj
,
&
loop_attribute_group
);
}
static
void
loop_sysfs_exit
(
struct
loop_device
*
lo
)
{
if
(
lo
->
sysfs_inited
)
sysfs_remove_group
(
&
disk_to_dev
(
lo
->
lo_disk
)
->
kobj
,
&
loop_attribute_group
);
}
static
void
loop_config_discard
(
struct
loop_device
*
lo
)
{
struct
file
*
file
=
lo
->
lo_backing_file
;
struct
inode
*
inode
=
file
->
f_mapping
->
host
;
struct
request_queue
*
q
=
lo
->
lo_queue
;
/*
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
if
((
!
file
->
f_op
->
fallocate
)
||
lo
->
lo_encrypt_key_size
)
{
q
->
limits
.
discard_granularity
=
0
;
q
->
limits
.
discard_alignment
=
0
;
blk_queue_max_discard_sectors
(
q
,
0
);
blk_queue_max_write_zeroes_sectors
(
q
,
0
);
blk_queue_flag_clear
(
QUEUE_FLAG_DISCARD
,
q
);
return
;
}
q
->
limits
.
discard_granularity
=
inode
->
i_sb
->
s_blocksize
;
q
->
limits
.
discard_alignment
=
0
;
blk_queue_max_discard_sectors
(
q
,
UINT_MAX
>>
9
);
blk_queue_max_write_zeroes_sectors
(
q
,
UINT_MAX
>>
9
);
blk_queue_flag_set
(
QUEUE_FLAG_DISCARD
,
q
);
}
static
void
loop_unprepare_queue
(
struct
loop_device
*
lo
)
{
kthread_flush_worker
(
&
lo
->
worker
);
kthread_stop
(
lo
->
worker_task
);
}
static
int
loop_kthread_worker_fn
(
void
*
worker_ptr
)
{
current
->
flags
|=
PF_LESS_THROTTLE
;
return
kthread_worker_fn
(
worker_ptr
);
}
static
int
loop_prepare_queue
(
struct
loop_device
*
lo
)
{
kthread_init_worker
(
&
lo
->
worker
);
lo
->
worker_task
=
kthread_run
(
loop_kthread_worker_fn
,
&
lo
->
worker
,
"loop%d"
,
lo
->
lo_number
);
if
(
IS_ERR
(
lo
->
worker_task
))
return
-
ENOMEM
;
set_user_nice
(
lo
->
worker_task
,
MIN_NICE
);
return
0
;
}
static
int
loop_set_fd
(
struct
loop_device
*
lo
,
fmode_t
mode
,
struct
block_device
*
bdev
,
unsigned
int
arg
)
{
struct
file
*
file
;
struct
inode
*
inode
;
struct
address_space
*
mapping
;
int
lo_flags
=
0
;
int
error
;
loff_t
size
;
/* This is safe, since we have a reference from open(). */
__module_get
(
THIS_MODULE
);
error
=
-
EBADF
;
file
=
fget
(
arg
);
if
(
!
file
)
goto
out
;
error
=
-
EBUSY
;
if
(
lo
->
lo_state
!=
Lo_unbound
)
goto
out_putf
;
error
=
loop_validate_file
(
file
,
bdev
);
if
(
error
)
goto
out_putf
;
mapping
=
file
->
f_mapping
;
inode
=
mapping
->
host
;
if
(
!
(
file
->
f_mode
&
FMODE_WRITE
)
||
!
(
mode
&
FMODE_WRITE
)
||
!
file
->
f_op
->
write_iter
)
lo_flags
|=
LO_FLAGS_READ_ONLY
;
error
=
-
EFBIG
;
size
=
get_loop_size
(
lo
,
file
);
if
((
loff_t
)(
sector_t
)
size
!=
size
)
goto
out_putf
;
error
=
loop_prepare_queue
(
lo
);
if
(
error
)
goto
out_putf
;
error
=
0
;
set_device_ro
(
bdev
,
(
lo_flags
&
LO_FLAGS_READ_ONLY
)
!=
0
);
lo
->
use_dio
=
false
;
lo
->
lo_device
=
bdev
;
lo
->
lo_flags
=
lo_flags
;
lo
->
lo_backing_file
=
file
;
lo
->
transfer
=
NULL
;
lo
->
ioctl
=
NULL
;
lo
->
lo_sizelimit
=
0
;
lo
->
old_gfp_mask
=
mapping_gfp_mask
(
mapping
);
mapping_set_gfp_mask
(
mapping
,
lo
->
old_gfp_mask
&
~
(
__GFP_IO
|
__GFP_FS
));
if
(
!
(
lo_flags
&
LO_FLAGS_READ_ONLY
)
&&
file
->
f_op
->
fsync
)
blk_queue_write_cache
(
lo
->
lo_queue
,
true
,
false
);
loop_update_dio
(
lo
);
set_capacity
(
lo
->
lo_disk
,
size
);
bd_set_size
(
bdev
,
size
<<
9
);
loop_sysfs_init
(
lo
);
/* let user-space know about the new size */
kobject_uevent
(
&
disk_to_dev
(
bdev
->
bd_disk
)
->
kobj
,
KOBJ_CHANGE
);
set_blocksize
(
bdev
,
S_ISBLK
(
inode
->
i_mode
)
?
block_size
(
inode
->
i_bdev
)
:
PAGE_SIZE
);
lo
->
lo_state
=
Lo_bound
;
if
(
part_shift
)
lo
->
lo_flags
|=
LO_FLAGS_PARTSCAN
;
if
(
lo
->
lo_flags
&
LO_FLAGS_PARTSCAN
)
loop_reread_partitions
(
lo
,
bdev
);
/* Grab the block_device to prevent its destruction after we
* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
*/
bdgrab
(
bdev
);
return
0
;
out_putf:
fput
(
file
);
out:
/* This is safe: open() is still holding a reference. */
module_put
(
THIS_MODULE
);
return
error
;
}
static
int
loop_release_xfer
(
struct
loop_device
*
lo
)
{
int
err
=
0
;
struct
loop_func_table
*
xfer
=
lo
->
lo_encryption
;
if
(
xfer
)
{
if
(
xfer
->
release
)
err
=
xfer
->
release
(
lo
);
lo
->
transfer
=
NULL
;
lo
->
lo_encryption
=
NULL
;
module_put
(
xfer
->
owner
);
}
return
err
;
}
static
int
loop_init_xfer
(
struct
loop_device
*
lo
,
struct
loop_func_table
*
xfer
,
const
struct
loop_info64
*
i
)
{
int
err
=
0
;
if
(
xfer
)
{
struct
module
*
owner
=
xfer
->
owner
;
if
(
!
try_module_get
(
owner
))
return
-
EINVAL
;
if
(
xfer
->
init
)
err
=
xfer
->
init
(
lo
,
i
);
if
(
err
)
module_put
(
owner
);
else
lo
->
lo_encryption
=
xfer
;
}
return
err
;
}
static
int
loop_clr_fd
(
struct
loop_device
*
lo
)
{
struct
file
*
filp
=
lo
->
lo_backing_file
;
gfp_t
gfp
=
lo
->
old_gfp_mask
;
struct
block_device
*
bdev
=
lo
->
lo_device
;
if
(
lo
->
lo_state
!=
Lo_bound
)
return
-
ENXIO
;
/*
* If we've explicitly asked to tear down the loop device,
* and it has an elevated reference count, set it for auto-teardown when
* the last reference goes away. This stops $!~#$@ udev from
* preventing teardown because it decided that it needs to run blkid on
* the loopback device whenever they appear. xfstests is notorious for
* failing tests because blkid via udev races with a losetup
* <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
* command to fail with EBUSY.
*/
if
(
atomic_read
(
&
lo
->
lo_refcnt
)
>
1
)
{
lo
->
lo_flags
|=
LO_FLAGS_AUTOCLEAR
;
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
return
0
;
}
if
(
filp
==
NULL
)
return
-
EINVAL
;
/* freeze request queue during the transition */
blk_mq_freeze_queue
(
lo
->
lo_queue
);
spin_lock_irq
(
&
lo
->
lo_lock
);
lo
->
lo_state
=
Lo_rundown
;
lo
->
lo_backing_file
=
NULL
;
spin_unlock_irq
(
&
lo
->
lo_lock
);
loop_release_xfer
(
lo
);
lo
->
transfer
=
NULL
;
lo
->
ioctl
=
NULL
;
lo
->
lo_device
=
NULL
;
lo
->
lo_encryption
=
NULL
;
lo
->
lo_offset
=
0
;
lo
->
lo_sizelimit
=
0
;
lo
->
lo_encrypt_key_size
=
0
;
memset
(
lo
->
lo_encrypt_key
,
0
,
LO_KEY_SIZE
);
memset
(
lo
->
lo_crypt_name
,
0
,
LO_NAME_SIZE
);
memset
(
lo
->
lo_file_name
,
0
,
LO_NAME_SIZE
);
blk_queue_logical_block_size
(
lo
->
lo_queue
,
512
);
blk_queue_physical_block_size
(
lo
->
lo_queue
,
512
);
blk_queue_io_min
(
lo
->
lo_queue
,
512
);
if
(
bdev
)
{
bdput
(
bdev
);
invalidate_bdev
(
bdev
);
bdev
->
bd_inode
->
i_mapping
->
wb_err
=
0
;
}
set_capacity
(
lo
->
lo_disk
,
0
);
loop_sysfs_exit
(
lo
);
if
(
bdev
)
{
bd_set_size
(
bdev
,
0
);
/* let user-space know about this change */
kobject_uevent
(
&
disk_to_dev
(
bdev
->
bd_disk
)
->
kobj
,
KOBJ_CHANGE
);
}
mapping_set_gfp_mask
(
filp
->
f_mapping
,
gfp
);
lo
->
lo_state
=
Lo_unbound
;
/* This is safe: open() is still holding a reference. */
module_put
(
THIS_MODULE
);
blk_mq_unfreeze_queue
(
lo
->
lo_queue
);
if
(
lo
->
lo_flags
&
LO_FLAGS_PARTSCAN
&&
bdev
)
loop_reread_partitions
(
lo
,
bdev
);
lo
->
lo_flags
=
0
;
if
(
!
part_shift
)
lo
->
lo_disk
->
flags
|=
GENHD_FL_NO_PART_SCAN
;
loop_unprepare_queue
(
lo
);
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
/*
* Need not hold lo_ctl_mutex to fput backing file.
* Calling fput holding lo_ctl_mutex triggers a circular
* lock dependency possibility warning as fput can take
* bd_mutex which is usually taken before lo_ctl_mutex.
*/
fput
(
filp
);
return
0
;
}
static
int
loop_set_status
(
struct
loop_device
*
lo
,
const
struct
loop_info64
*
info
)
{
int
err
;
struct
loop_func_table
*
xfer
;
kuid_t
uid
=
current_uid
();
if
(
lo
->
lo_encrypt_key_size
&&
!
uid_eq
(
lo
->
lo_key_owner
,
uid
)
&&
!
capable
(
CAP_SYS_ADMIN
))
return
-
EPERM
;
if
(
lo
->
lo_state
!=
Lo_bound
)
return
-
ENXIO
;
if
((
unsigned
int
)
info
->
lo_encrypt_key_size
>
LO_KEY_SIZE
)
return
-
EINVAL
;
/* I/O need to be drained during transfer transition */
blk_mq_freeze_queue
(
lo
->
lo_queue
);
err
=
loop_release_xfer
(
lo
);
if
(
err
)
goto
exit
;
if
(
info
->
lo_encrypt_type
)
{
unsigned
int
type
=
info
->
lo_encrypt_type
;
if
(
type
>=
MAX_LO_CRYPT
)
{
err
=
-
EINVAL
;
goto
exit
;
}
xfer
=
xfer_funcs
[
type
];
if
(
xfer
==
NULL
)
{
err
=
-
EINVAL
;
goto
exit
;
}
}
else
xfer
=
NULL
;
err
=
loop_init_xfer
(
lo
,
xfer
,
info
);
if
(
err
)
goto
exit
;
if
(
lo
->
lo_offset
!=
info
->
lo_offset
||
lo
->
lo_sizelimit
!=
info
->
lo_sizelimit
)
{
if
(
figure_loop_size
(
lo
,
info
->
lo_offset
,
info
->
lo_sizelimit
))
{
err
=
-
EFBIG
;
goto
exit
;
}
}
loop_config_discard
(
lo
);
memcpy
(
lo
->
lo_file_name
,
info
->
lo_file_name
,
LO_NAME_SIZE
);
memcpy
(
lo
->
lo_crypt_name
,
info
->
lo_crypt_name
,
LO_NAME_SIZE
);
lo
->
lo_file_name
[
LO_NAME_SIZE
-
1
]
=
0
;
lo
->
lo_crypt_name
[
LO_NAME_SIZE
-
1
]
=
0
;
if
(
!
xfer
)
xfer
=
&
none_funcs
;
lo
->
transfer
=
xfer
->
transfer
;
lo
->
ioctl
=
xfer
->
ioctl
;
if
((
lo
->
lo_flags
&
LO_FLAGS_AUTOCLEAR
)
!=
(
info
->
lo_flags
&
LO_FLAGS_AUTOCLEAR
))
lo
->
lo_flags
^=
LO_FLAGS_AUTOCLEAR
;
lo
->
lo_encrypt_key_size
=
info
->
lo_encrypt_key_size
;
lo
->
lo_init
[
0
]
=
info
->
lo_init
[
0
];
lo
->
lo_init
[
1
]
=
info
->
lo_init
[
1
];
if
(
info
->
lo_encrypt_key_size
)
{
memcpy
(
lo
->
lo_encrypt_key
,
info
->
lo_encrypt_key
,
info
->
lo_encrypt_key_size
);
lo
->
lo_key_owner
=
uid
;
}
/* update dio if lo_offset or transfer is changed */
__loop_update_dio
(
lo
,
lo
->
use_dio
);
exit:
blk_mq_unfreeze_queue
(
lo
->
lo_queue
);
if
(
!
err
&&
(
info
->
lo_flags
&
LO_FLAGS_PARTSCAN
)
&&
!
(
lo
->
lo_flags
&
LO_FLAGS_PARTSCAN
))
{
lo
->
lo_flags
|=
LO_FLAGS_PARTSCAN
;
lo
->
lo_disk
->
flags
&=
~
GENHD_FL_NO_PART_SCAN
;
loop_reread_partitions
(
lo
,
lo
->
lo_device
);
}
return
err
;
}
static
int
loop_get_status
(
struct
loop_device
*
lo
,
struct
loop_info64
*
info
)
{
struct
file
*
file
;
struct
kstat
stat
;
int
ret
;
if
(
lo
->
lo_state
!=
Lo_bound
)
{
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
return
-
ENXIO
;
}
memset
(
info
,
0
,
sizeof
(
*
info
));
info
->
lo_number
=
lo
->
lo_number
;
info
->
lo_offset
=
lo
->
lo_offset
;
info
->
lo_sizelimit
=
lo
->
lo_sizelimit
;
info
->
lo_flags
=
lo
->
lo_flags
;
memcpy
(
info
->
lo_file_name
,
lo
->
lo_file_name
,
LO_NAME_SIZE
);
memcpy
(
info
->
lo_crypt_name
,
lo
->
lo_crypt_name
,
LO_NAME_SIZE
);
info
->
lo_encrypt_type
=
lo
->
lo_encryption
?
lo
->
lo_encryption
->
number
:
0
;
if
(
lo
->
lo_encrypt_key_size
&&
capable
(
CAP_SYS_ADMIN
))
{
info
->
lo_encrypt_key_size
=
lo
->
lo_encrypt_key_size
;
memcpy
(
info
->
lo_encrypt_key
,
lo
->
lo_encrypt_key
,
lo
->
lo_encrypt_key_size
);
}
/* Drop lo_ctl_mutex while we call into the filesystem. */
file
=
get_file
(
lo
->
lo_backing_file
);
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
ret
=
vfs_getattr
(
&
file
->
f_path
,
&
stat
,
STATX_INO
,
AT_STATX_SYNC_AS_STAT
);
if
(
!
ret
)
{
info
->
lo_device
=
huge_encode_dev
(
stat
.
dev
);
info
->
lo_inode
=
stat
.
ino
;
info
->
lo_rdevice
=
huge_encode_dev
(
stat
.
rdev
);
}
fput
(
file
);
return
ret
;
}
static
void
loop_info64_from_old
(
const
struct
loop_info
*
info
,
struct
loop_info64
*
info64
)
{
memset
(
info64
,
0
,
sizeof
(
*
info64
));
info64
->
lo_number
=
info
->
lo_number
;
info64
->
lo_device
=
info
->
lo_device
;
info64
->
lo_inode
=
info
->
lo_inode
;
info64
->
lo_rdevice
=
info
->
lo_rdevice
;
info64
->
lo_offset
=
info
->
lo_offset
;
info64
->
lo_sizelimit
=
0
;
info64
->
lo_encrypt_type
=
info
->
lo_encrypt_type
;
info64
->
lo_encrypt_key_size
=
info
->
lo_encrypt_key_size
;
info64
->
lo_flags
=
info
->
lo_flags
;
info64
->
lo_init
[
0
]
=
info
->
lo_init
[
0
];
info64
->
lo_init
[
1
]
=
info
->
lo_init
[
1
];
if
(
info
->
lo_encrypt_type
==
LO_CRYPT_CRYPTOAPI
)
memcpy
(
info64
->
lo_crypt_name
,
info
->
lo_name
,
LO_NAME_SIZE
);
else
memcpy
(
info64
->
lo_file_name
,
info
->
lo_name
,
LO_NAME_SIZE
);
memcpy
(
info64
->
lo_encrypt_key
,
info
->
lo_encrypt_key
,
LO_KEY_SIZE
);
}
static
int
loop_info64_to_old
(
const
struct
loop_info64
*
info64
,
struct
loop_info
*
info
)
{
memset
(
info
,
0
,
sizeof
(
*
info
));
info
->
lo_number
=
info64
->
lo_number
;
info
->
lo_device
=
info64
->
lo_device
;
info
->
lo_inode
=
info64
->
lo_inode
;
info
->
lo_rdevice
=
info64
->
lo_rdevice
;
info
->
lo_offset
=
info64
->
lo_offset
;
info
->
lo_encrypt_type
=
info64
->
lo_encrypt_type
;
info
->
lo_encrypt_key_size
=
info64
->
lo_encrypt_key_size
;
info
->
lo_flags
=
info64
->
lo_flags
;
info
->
lo_init
[
0
]
=
info64
->
lo_init
[
0
];
info
->
lo_init
[
1
]
=
info64
->
lo_init
[
1
];
if
(
info
->
lo_encrypt_type
==
LO_CRYPT_CRYPTOAPI
)
memcpy
(
info
->
lo_name
,
info64
->
lo_crypt_name
,
LO_NAME_SIZE
);
else
memcpy
(
info
->
lo_name
,
info64
->
lo_file_name
,
LO_NAME_SIZE
);
memcpy
(
info
->
lo_encrypt_key
,
info64
->
lo_encrypt_key
,
LO_KEY_SIZE
);
/* error in case values were truncated */
if
(
info
->
lo_device
!=
info64
->
lo_device
||
info
->
lo_rdevice
!=
info64
->
lo_rdevice
||
info
->
lo_inode
!=
info64
->
lo_inode
||
info
->
lo_offset
!=
info64
->
lo_offset
)
return
-
EOVERFLOW
;
return
0
;
}
static
int
loop_set_status_old
(
struct
loop_device
*
lo
,
const
struct
loop_info
__user
*
arg
)
{
struct
loop_info
info
;
struct
loop_info64
info64
;
if
(
copy_from_user
(
&
info
,
arg
,
sizeof
(
struct
loop_info
)))
return
-
EFAULT
;
loop_info64_from_old
(
&
info
,
&
info64
);
return
loop_set_status
(
lo
,
&
info64
);
}
static
int
loop_set_status64
(
struct
loop_device
*
lo
,
const
struct
loop_info64
__user
*
arg
)
{
struct
loop_info64
info64
;
if
(
copy_from_user
(
&
info64
,
arg
,
sizeof
(
struct
loop_info64
)))
return
-
EFAULT
;
return
loop_set_status
(
lo
,
&
info64
);
}
static
int
loop_get_status_old
(
struct
loop_device
*
lo
,
struct
loop_info
__user
*
arg
)
{
struct
loop_info
info
;
struct
loop_info64
info64
;
int
err
;
if
(
!
arg
)
{
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
return
-
EINVAL
;
}
err
=
loop_get_status
(
lo
,
&
info64
);
if
(
!
err
)
err
=
loop_info64_to_old
(
&
info64
,
&
info
);
if
(
!
err
&&
copy_to_user
(
arg
,
&
info
,
sizeof
(
info
)))
err
=
-
EFAULT
;
return
err
;
}
static
int
loop_get_status64
(
struct
loop_device
*
lo
,
struct
loop_info64
__user
*
arg
)
{
struct
loop_info64
info64
;
int
err
;
if
(
!
arg
)
{
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
return
-
EINVAL
;
}
err
=
loop_get_status
(
lo
,
&
info64
);
if
(
!
err
&&
copy_to_user
(
arg
,
&
info64
,
sizeof
(
info64
)))
err
=
-
EFAULT
;
return
err
;
}
static
int
loop_set_capacity
(
struct
loop_device
*
lo
)
{
if
(
unlikely
(
lo
->
lo_state
!=
Lo_bound
))
return
-
ENXIO
;
return
figure_loop_size
(
lo
,
lo
->
lo_offset
,
lo
->
lo_sizelimit
);
}
static
int
loop_set_dio
(
struct
loop_device
*
lo
,
unsigned
long
arg
)
{
int
error
=
-
ENXIO
;
if
(
lo
->
lo_state
!=
Lo_bound
)
goto
out
;
__loop_update_dio
(
lo
,
!!
arg
);
if
(
lo
->
use_dio
==
!!
arg
)
return
0
;
error
=
-
EINVAL
;
out:
return
error
;
}
static
int
loop_set_block_size
(
struct
loop_device
*
lo
,
unsigned
long
arg
)
{
if
(
lo
->
lo_state
!=
Lo_bound
)
return
-
ENXIO
;
if
(
arg
<
512
||
arg
>
PAGE_SIZE
||
!
is_power_of_2
(
arg
))
return
-
EINVAL
;
blk_mq_freeze_queue
(
lo
->
lo_queue
);
blk_queue_logical_block_size
(
lo
->
lo_queue
,
arg
);
blk_queue_physical_block_size
(
lo
->
lo_queue
,
arg
);
blk_queue_io_min
(
lo
->
lo_queue
,
arg
);
loop_update_dio
(
lo
);
blk_mq_unfreeze_queue
(
lo
->
lo_queue
);
return
0
;
}
static
int
lo_ioctl
(
struct
block_device
*
bdev
,
fmode_t
mode
,
unsigned
int
cmd
,
unsigned
long
arg
)
{
struct
loop_device
*
lo
=
bdev
->
bd_disk
->
private_data
;
int
err
;
err
=
mutex_lock_killable_nested
(
&
lo
->
lo_ctl_mutex
,
1
);
if
(
err
)
goto
out_unlocked
;
switch
(
cmd
)
{
case
LOOP_SET_FD
:
err
=
loop_set_fd
(
lo
,
mode
,
bdev
,
arg
);
break
;
case
LOOP_CHANGE_FD
:
err
=
loop_change_fd
(
lo
,
bdev
,
arg
);
break
;
case
LOOP_CLR_FD
:
/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
err
=
loop_clr_fd
(
lo
);
if
(
!
err
)
goto
out_unlocked
;
break
;
case
LOOP_SET_STATUS
:
err
=
-
EPERM
;
if
((
mode
&
FMODE_WRITE
)
||
capable
(
CAP_SYS_ADMIN
))
err
=
loop_set_status_old
(
lo
,
(
struct
loop_info
__user
*
)
arg
);
break
;
case
LOOP_GET_STATUS
:
err
=
loop_get_status_old
(
lo
,
(
struct
loop_info
__user
*
)
arg
);
/* loop_get_status() unlocks lo_ctl_mutex */
goto
out_unlocked
;
case
LOOP_SET_STATUS64
:
err
=
-
EPERM
;
if
((
mode
&
FMODE_WRITE
)
||
capable
(
CAP_SYS_ADMIN
))
err
=
loop_set_status64
(
lo
,
(
struct
loop_info64
__user
*
)
arg
);
break
;
case
LOOP_GET_STATUS64
:
err
=
loop_get_status64
(
lo
,
(
struct
loop_info64
__user
*
)
arg
);
/* loop_get_status() unlocks lo_ctl_mutex */
goto
out_unlocked
;
case
LOOP_SET_CAPACITY
:
err
=
-
EPERM
;
if
((
mode
&
FMODE_WRITE
)
||
capable
(
CAP_SYS_ADMIN
))
err
=
loop_set_capacity
(
lo
);
break
;
case
LOOP_SET_DIRECT_IO
:
err
=
-
EPERM
;
if
((
mode
&
FMODE_WRITE
)
||
capable
(
CAP_SYS_ADMIN
))
err
=
loop_set_dio
(
lo
,
arg
);
break
;
case
LOOP_SET_BLOCK_SIZE
:
err
=
-
EPERM
;
if
((
mode
&
FMODE_WRITE
)
||
capable
(
CAP_SYS_ADMIN
))
err
=
loop_set_block_size
(
lo
,
arg
);
break
;
default:
err
=
lo
->
ioctl
?
lo
->
ioctl
(
lo
,
cmd
,
arg
)
:
-
EINVAL
;
}
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
out_unlocked:
return
err
;
}
#ifdef CONFIG_COMPAT
struct
compat_loop_info
{
compat_int_t
lo_number
;
/* ioctl r/o */
compat_dev_t
lo_device
;
/* ioctl r/o */
compat_ulong_t
lo_inode
;
/* ioctl r/o */
compat_dev_t
lo_rdevice
;
/* ioctl r/o */
compat_int_t
lo_offset
;
compat_int_t
lo_encrypt_type
;
compat_int_t
lo_encrypt_key_size
;
/* ioctl w/o */
compat_int_t
lo_flags
;
/* ioctl r/o */
char
lo_name
[
LO_NAME_SIZE
];
unsigned
char
lo_encrypt_key
[
LO_KEY_SIZE
];
/* ioctl w/o */
compat_ulong_t
lo_init
[
2
];
char
reserved
[
4
];
};
/*
* Transfer 32-bit compatibility structure in userspace to 64-bit loop info
* - noinlined to reduce stack space usage in main part of driver
*/
static
noinline
int
loop_info64_from_compat
(
const
struct
compat_loop_info
__user
*
arg
,
struct
loop_info64
*
info64
)
{
struct
compat_loop_info
info
;
if
(
copy_from_user
(
&
info
,
arg
,
sizeof
(
info
)))
return
-
EFAULT
;
memset
(
info64
,
0
,
sizeof
(
*
info64
));
info64
->
lo_number
=
info
.
lo_number
;
info64
->
lo_device
=
info
.
lo_device
;
info64
->
lo_inode
=
info
.
lo_inode
;
info64
->
lo_rdevice
=
info
.
lo_rdevice
;
info64
->
lo_offset
=
info
.
lo_offset
;
info64
->
lo_sizelimit
=
0
;
info64
->
lo_encrypt_type
=
info
.
lo_encrypt_type
;
info64
->
lo_encrypt_key_size
=
info
.
lo_encrypt_key_size
;
info64
->
lo_flags
=
info
.
lo_flags
;
info64
->
lo_init
[
0
]
=
info
.
lo_init
[
0
];
info64
->
lo_init
[
1
]
=
info
.
lo_init
[
1
];
if
(
info
.
lo_encrypt_type
==
LO_CRYPT_CRYPTOAPI
)
memcpy
(
info64
->
lo_crypt_name
,
info
.
lo_name
,
LO_NAME_SIZE
);
else
memcpy
(
info64
->
lo_file_name
,
info
.
lo_name
,
LO_NAME_SIZE
);
memcpy
(
info64
->
lo_encrypt_key
,
info
.
lo_encrypt_key
,
LO_KEY_SIZE
);
return
0
;
}
/*
* Transfer 64-bit loop info to 32-bit compatibility structure in userspace
* - noinlined to reduce stack space usage in main part of driver
*/
static
noinline
int
loop_info64_to_compat
(
const
struct
loop_info64
*
info64
,
struct
compat_loop_info
__user
*
arg
)
{
struct
compat_loop_info
info
;
memset
(
&
info
,
0
,
sizeof
(
info
));
info
.
lo_number
=
info64
->
lo_number
;
info
.
lo_device
=
info64
->
lo_device
;
info
.
lo_inode
=
info64
->
lo_inode
;
info
.
lo_rdevice
=
info64
->
lo_rdevice
;
info
.
lo_offset
=
info64
->
lo_offset
;
info
.
lo_encrypt_type
=
info64
->
lo_encrypt_type
;
info
.
lo_encrypt_key_size
=
info64
->
lo_encrypt_key_size
;
info
.
lo_flags
=
info64
->
lo_flags
;
info
.
lo_init
[
0
]
=
info64
->
lo_init
[
0
];
info
.
lo_init
[
1
]
=
info64
->
lo_init
[
1
];
if
(
info
.
lo_encrypt_type
==
LO_CRYPT_CRYPTOAPI
)
memcpy
(
info
.
lo_name
,
info64
->
lo_crypt_name
,
LO_NAME_SIZE
);
else
memcpy
(
info
.
lo_name
,
info64
->
lo_file_name
,
LO_NAME_SIZE
);
memcpy
(
info
.
lo_encrypt_key
,
info64
->
lo_encrypt_key
,
LO_KEY_SIZE
);
/* error in case values were truncated */
if
(
info
.
lo_device
!=
info64
->
lo_device
||
info
.
lo_rdevice
!=
info64
->
lo_rdevice
||
info
.
lo_inode
!=
info64
->
lo_inode
||
info
.
lo_offset
!=
info64
->
lo_offset
||
info
.
lo_init
[
0
]
!=
info64
->
lo_init
[
0
]
||
info
.
lo_init
[
1
]
!=
info64
->
lo_init
[
1
])
return
-
EOVERFLOW
;
if
(
copy_to_user
(
arg
,
&
info
,
sizeof
(
info
)))
return
-
EFAULT
;
return
0
;
}
static
int
loop_set_status_compat
(
struct
loop_device
*
lo
,
const
struct
compat_loop_info
__user
*
arg
)
{
struct
loop_info64
info64
;
int
ret
;
ret
=
loop_info64_from_compat
(
arg
,
&
info64
);
if
(
ret
<
0
)
return
ret
;
return
loop_set_status
(
lo
,
&
info64
);
}
static
int
loop_get_status_compat
(
struct
loop_device
*
lo
,
struct
compat_loop_info
__user
*
arg
)
{
struct
loop_info64
info64
;
int
err
;
if
(
!
arg
)
{
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
return
-
EINVAL
;
}
err
=
loop_get_status
(
lo
,
&
info64
);
if
(
!
err
)
err
=
loop_info64_to_compat
(
&
info64
,
arg
);
return
err
;
}
static
int
lo_compat_ioctl
(
struct
block_device
*
bdev
,
fmode_t
mode
,
unsigned
int
cmd
,
unsigned
long
arg
)
{
struct
loop_device
*
lo
=
bdev
->
bd_disk
->
private_data
;
int
err
;
switch
(
cmd
)
{
case
LOOP_SET_STATUS
:
err
=
mutex_lock_killable
(
&
lo
->
lo_ctl_mutex
);
if
(
!
err
)
{
err
=
loop_set_status_compat
(
lo
,
(
const
struct
compat_loop_info
__user
*
)
arg
);
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
}
break
;
case
LOOP_GET_STATUS
:
err
=
mutex_lock_killable
(
&
lo
->
lo_ctl_mutex
);
if
(
!
err
)
{
err
=
loop_get_status_compat
(
lo
,
(
struct
compat_loop_info
__user
*
)
arg
);
/* loop_get_status() unlocks lo_ctl_mutex */
}
break
;
case
LOOP_SET_CAPACITY
:
case
LOOP_CLR_FD
:
case
LOOP_GET_STATUS64
:
case
LOOP_SET_STATUS64
:
arg
=
(
unsigned
long
)
compat_ptr
(
arg
);
/* fall through */
case
LOOP_SET_FD
:
case
LOOP_CHANGE_FD
:
case
LOOP_SET_BLOCK_SIZE
:
err
=
lo_ioctl
(
bdev
,
mode
,
cmd
,
arg
);
break
;
default:
err
=
-
ENOIOCTLCMD
;
break
;
}
return
err
;
}
#endif
static
int
lo_open
(
struct
block_device
*
bdev
,
fmode_t
mode
)
{
struct
loop_device
*
lo
;
int
err
=
0
;
mutex_lock
(
&
loop_index_mutex
);
lo
=
bdev
->
bd_disk
->
private_data
;
if
(
!
lo
)
{
err
=
-
ENXIO
;
goto
out
;
}
atomic_inc
(
&
lo
->
lo_refcnt
);
out:
mutex_unlock
(
&
loop_index_mutex
);
return
err
;
}
static
void
__lo_release
(
struct
loop_device
*
lo
)
{
int
err
;
if
(
atomic_dec_return
(
&
lo
->
lo_refcnt
))
return
;
mutex_lock
(
&
lo
->
lo_ctl_mutex
);
if
(
lo
->
lo_flags
&
LO_FLAGS_AUTOCLEAR
)
{
/*
* In autoclear mode, stop the loop thread
* and remove configuration after last close.
*/
err
=
loop_clr_fd
(
lo
);
if
(
!
err
)
return
;
}
else
if
(
lo
->
lo_state
==
Lo_bound
)
{
/*
* Otherwise keep thread (if running) and config,
* but flush possible ongoing bios in thread.
*/
blk_mq_freeze_queue
(
lo
->
lo_queue
);
blk_mq_unfreeze_queue
(
lo
->
lo_queue
);
}
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
}
static
void
lo_release
(
struct
gendisk
*
disk
,
fmode_t
mode
)
{
mutex_lock
(
&
loop_index_mutex
);
__lo_release
(
disk
->
private_data
);
mutex_unlock
(
&
loop_index_mutex
);
}
static
const
struct
block_device_operations
lo_fops
=
{
.
owner
=
THIS_MODULE
,
.
open
=
lo_open
,
.
release
=
lo_release
,
.
ioctl
=
lo_ioctl
,
#ifdef CONFIG_COMPAT
.
compat_ioctl
=
lo_compat_ioctl
,
#endif
};
/*
* And now the modules code and kernel interface.
*/
static
int
max_loop
;
module_param
(
max_loop
,
int
,
0444
);
MODULE_PARM_DESC
(
max_loop
,
"Maximum number of loop devices"
);
module_param
(
max_part
,
int
,
0444
);
MODULE_PARM_DESC
(
max_part
,
"Maximum number of partitions per loop device"
);
MODULE_LICENSE
(
"GPL"
);
MODULE_ALIAS_BLOCKDEV_MAJOR
(
LOOP_MAJOR
);
int
loop_register_transfer
(
struct
loop_func_table
*
funcs
)
{
unsigned
int
n
=
funcs
->
number
;
if
(
n
>=
MAX_LO_CRYPT
||
xfer_funcs
[
n
])
return
-
EINVAL
;
xfer_funcs
[
n
]
=
funcs
;
return
0
;
}
static
int
unregister_transfer_cb
(
int
id
,
void
*
ptr
,
void
*
data
)
{
struct
loop_device
*
lo
=
ptr
;
struct
loop_func_table
*
xfer
=
data
;
mutex_lock
(
&
lo
->
lo_ctl_mutex
);
if
(
lo
->
lo_encryption
==
xfer
)
loop_release_xfer
(
lo
);
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
return
0
;
}
int
loop_unregister_transfer
(
int
number
)
{
unsigned
int
n
=
number
;
struct
loop_func_table
*
xfer
;
if
(
n
==
0
||
n
>=
MAX_LO_CRYPT
||
(
xfer
=
xfer_funcs
[
n
])
==
NULL
)
return
-
EINVAL
;
xfer_funcs
[
n
]
=
NULL
;
idr_for_each
(
&
loop_index_idr
,
&
unregister_transfer_cb
,
xfer
);
return
0
;
}
EXPORT_SYMBOL
(
loop_register_transfer
);
EXPORT_SYMBOL
(
loop_unregister_transfer
);
static
blk_status_t
loop_queue_rq
(
struct
blk_mq_hw_ctx
*
hctx
,
const
struct
blk_mq_queue_data
*
bd
)
{
struct
request
*
rq
=
bd
->
rq
;
struct
loop_cmd
*
cmd
=
blk_mq_rq_to_pdu
(
rq
);
struct
loop_device
*
lo
=
rq
->
q
->
queuedata
;
blk_mq_start_request
(
rq
);
if
(
lo
->
lo_state
!=
Lo_bound
)
return
BLK_STS_IOERR
;
switch
(
req_op
(
rq
))
{
case
REQ_OP_FLUSH
:
case
REQ_OP_DISCARD
:
case
REQ_OP_WRITE_ZEROES
:
cmd
->
use_aio
=
false
;
break
;
default:
cmd
->
use_aio
=
lo
->
use_dio
;
break
;
}
/* always use the first bio's css */
#ifdef CONFIG_BLK_CGROUP
if
(
cmd
->
use_aio
&&
rq
->
bio
&&
rq
->
bio
->
bi_css
)
{
cmd
->
css
=
rq
->
bio
->
bi_css
;
css_get
(
cmd
->
css
);
}
else
#endif
cmd
->
css
=
NULL
;
kthread_queue_work
(
&
lo
->
worker
,
&
cmd
->
work
);
return
BLK_STS_OK
;
}
static
void
loop_handle_cmd
(
struct
loop_cmd
*
cmd
)
{
struct
request
*
rq
=
blk_mq_rq_from_pdu
(
cmd
);
const
bool
write
=
op_is_write
(
req_op
(
rq
));
struct
loop_device
*
lo
=
rq
->
q
->
queuedata
;
int
ret
=
0
;
if
(
write
&&
(
lo
->
lo_flags
&
LO_FLAGS_READ_ONLY
))
{
ret
=
-
EIO
;
goto
failed
;
}
ret
=
do_req_filebacked
(
lo
,
rq
);
failed:
/* complete non-aio request */
if
(
!
cmd
->
use_aio
||
ret
)
{
cmd
->
ret
=
ret
?
-
EIO
:
0
;
blk_mq_complete_request
(
rq
);
}
}
static
void
loop_queue_work
(
struct
kthread_work
*
work
)
{
struct
loop_cmd
*
cmd
=
container_of
(
work
,
struct
loop_cmd
,
work
);
loop_handle_cmd
(
cmd
);
}
static
int
loop_init_request
(
struct
blk_mq_tag_set
*
set
,
struct
request
*
rq
,
unsigned
int
hctx_idx
,
unsigned
int
numa_node
)
{
struct
loop_cmd
*
cmd
=
blk_mq_rq_to_pdu
(
rq
);
kthread_init_work
(
&
cmd
->
work
,
loop_queue_work
);
return
0
;
}
static
const
struct
blk_mq_ops
loop_mq_ops
=
{
.
queue_rq
=
loop_queue_rq
,
.
init_request
=
loop_init_request
,
.
complete
=
lo_complete_rq
,
};
static
int
loop_add
(
struct
loop_device
**
l
,
int
i
)
{
struct
loop_device
*
lo
;
struct
gendisk
*
disk
;
int
err
;
err
=
-
ENOMEM
;
lo
=
kzalloc
(
sizeof
(
*
lo
),
GFP_KERNEL
);
if
(
!
lo
)
goto
out
;
lo
->
lo_state
=
Lo_unbound
;
/* allocate id, if @id >= 0, we're requesting that specific id */
if
(
i
>=
0
)
{
err
=
idr_alloc
(
&
loop_index_idr
,
lo
,
i
,
i
+
1
,
GFP_KERNEL
);
if
(
err
==
-
ENOSPC
)
err
=
-
EEXIST
;
}
else
{
err
=
idr_alloc
(
&
loop_index_idr
,
lo
,
0
,
0
,
GFP_KERNEL
);
}
if
(
err
<
0
)
goto
out_free_dev
;
i
=
err
;
err
=
-
ENOMEM
;
lo
->
tag_set
.
ops
=
&
loop_mq_ops
;
lo
->
tag_set
.
nr_hw_queues
=
1
;
lo
->
tag_set
.
queue_depth
=
128
;
lo
->
tag_set
.
numa_node
=
NUMA_NO_NODE
;
lo
->
tag_set
.
cmd_size
=
sizeof
(
struct
loop_cmd
);
lo
->
tag_set
.
flags
=
BLK_MQ_F_SHOULD_MERGE
|
BLK_MQ_F_SG_MERGE
;
lo
->
tag_set
.
driver_data
=
lo
;
err
=
blk_mq_alloc_tag_set
(
&
lo
->
tag_set
);
if
(
err
)
goto
out_free_idr
;
lo
->
lo_queue
=
blk_mq_init_queue
(
&
lo
->
tag_set
);
if
(
IS_ERR_OR_NULL
(
lo
->
lo_queue
))
{
err
=
PTR_ERR
(
lo
->
lo_queue
);
goto
out_cleanup_tags
;
}
lo
->
lo_queue
->
queuedata
=
lo
;
blk_queue_max_hw_sectors
(
lo
->
lo_queue
,
BLK_DEF_MAX_SECTORS
);
/*
* By default, we do buffer IO, so it doesn't make sense to enable
* merge because the I/O submitted to backing file is handled page by
* page. For directio mode, merge does help to dispatch bigger request
* to underlayer disk. We will enable merge once directio is enabled.
*/
blk_queue_flag_set
(
QUEUE_FLAG_NOMERGES
,
lo
->
lo_queue
);
err
=
-
ENOMEM
;
disk
=
lo
->
lo_disk
=
alloc_disk
(
1
<<
part_shift
);
if
(
!
disk
)
goto
out_free_queue
;
/*
* Disable partition scanning by default. The in-kernel partition
* scanning can be requested individually per-device during its
* setup. Userspace can always add and remove partitions from all
* devices. The needed partition minors are allocated from the
* extended minor space, the main loop device numbers will continue
* to match the loop minors, regardless of the number of partitions
* used.
*
* If max_part is given, partition scanning is globally enabled for
* all loop devices. The minors for the main loop devices will be
* multiples of max_part.
*
* Note: Global-for-all-devices, set-only-at-init, read-only module
* parameteters like 'max_loop' and 'max_part' make things needlessly
* complicated, are too static, inflexible and may surprise
* userspace tools. Parameters like this in general should be avoided.
*/
if
(
!
part_shift
)
disk
->
flags
|=
GENHD_FL_NO_PART_SCAN
;
disk
->
flags
|=
GENHD_FL_EXT_DEVT
;
mutex_init
(
&
lo
->
lo_ctl_mutex
);
atomic_set
(
&
lo
->
lo_refcnt
,
0
);
lo
->
lo_number
=
i
;
spin_lock_init
(
&
lo
->
lo_lock
);
disk
->
major
=
LOOP_MAJOR
;
disk
->
first_minor
=
i
<<
part_shift
;
disk
->
fops
=
&
lo_fops
;
disk
->
private_data
=
lo
;
disk
->
queue
=
lo
->
lo_queue
;
sprintf
(
disk
->
disk_name
,
"loop%d"
,
i
);
add_disk
(
disk
);
*
l
=
lo
;
return
lo
->
lo_number
;
out_free_queue:
blk_cleanup_queue
(
lo
->
lo_queue
);
out_cleanup_tags:
blk_mq_free_tag_set
(
&
lo
->
tag_set
);
out_free_idr:
idr_remove
(
&
loop_index_idr
,
i
);
out_free_dev:
kfree
(
lo
);
out:
return
err
;
}
static
void
loop_remove
(
struct
loop_device
*
lo
)
{
del_gendisk
(
lo
->
lo_disk
);
blk_cleanup_queue
(
lo
->
lo_queue
);
blk_mq_free_tag_set
(
&
lo
->
tag_set
);
put_disk
(
lo
->
lo_disk
);
kfree
(
lo
);
}
static
int
find_free_cb
(
int
id
,
void
*
ptr
,
void
*
data
)
{
struct
loop_device
*
lo
=
ptr
;
struct
loop_device
**
l
=
data
;
if
(
lo
->
lo_state
==
Lo_unbound
)
{
*
l
=
lo
;
return
1
;
}
return
0
;
}
static
int
loop_lookup
(
struct
loop_device
**
l
,
int
i
)
{
struct
loop_device
*
lo
;
int
ret
=
-
ENODEV
;
if
(
i
<
0
)
{
int
err
;
err
=
idr_for_each
(
&
loop_index_idr
,
&
find_free_cb
,
&
lo
);
if
(
err
==
1
)
{
*
l
=
lo
;
ret
=
lo
->
lo_number
;
}
goto
out
;
}
/* lookup and return a specific i */
lo
=
idr_find
(
&
loop_index_idr
,
i
);
if
(
lo
)
{
*
l
=
lo
;
ret
=
lo
->
lo_number
;
}
out:
return
ret
;
}
static
struct
kobject
*
loop_probe
(
dev_t
dev
,
int
*
part
,
void
*
data
)
{
struct
loop_device
*
lo
;
struct
kobject
*
kobj
;
int
err
;
mutex_lock
(
&
loop_index_mutex
);
err
=
loop_lookup
(
&
lo
,
MINOR
(
dev
)
>>
part_shift
);
if
(
err
<
0
)
err
=
loop_add
(
&
lo
,
MINOR
(
dev
)
>>
part_shift
);
if
(
err
<
0
)
kobj
=
NULL
;
else
kobj
=
get_disk_and_module
(
lo
->
lo_disk
);
mutex_unlock
(
&
loop_index_mutex
);
*
part
=
0
;
return
kobj
;
}
static
long
loop_control_ioctl
(
struct
file
*
file
,
unsigned
int
cmd
,
unsigned
long
parm
)
{
struct
loop_device
*
lo
;
int
ret
=
-
ENOSYS
;
mutex_lock
(
&
loop_index_mutex
);
switch
(
cmd
)
{
case
LOOP_CTL_ADD
:
ret
=
loop_lookup
(
&
lo
,
parm
);
if
(
ret
>=
0
)
{
ret
=
-
EEXIST
;
break
;
}
ret
=
loop_add
(
&
lo
,
parm
);
break
;
case
LOOP_CTL_REMOVE
:
ret
=
loop_lookup
(
&
lo
,
parm
);
if
(
ret
<
0
)
break
;
ret
=
mutex_lock_killable
(
&
lo
->
lo_ctl_mutex
);
if
(
ret
)
break
;
if
(
lo
->
lo_state
!=
Lo_unbound
)
{
ret
=
-
EBUSY
;
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
break
;
}
if
(
atomic_read
(
&
lo
->
lo_refcnt
)
>
0
)
{
ret
=
-
EBUSY
;
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
break
;
}
lo
->
lo_disk
->
private_data
=
NULL
;
mutex_unlock
(
&
lo
->
lo_ctl_mutex
);
idr_remove
(
&
loop_index_idr
,
lo
->
lo_number
);
loop_remove
(
lo
);
break
;
case
LOOP_CTL_GET_FREE
:
ret
=
loop_lookup
(
&
lo
,
-
1
);
if
(
ret
>=
0
)
break
;
ret
=
loop_add
(
&
lo
,
-
1
);
}
mutex_unlock
(
&
loop_index_mutex
);
return
ret
;
}
static
const
struct
file_operations
loop_ctl_fops
=
{
.
open
=
nonseekable_open
,
.
unlocked_ioctl
=
loop_control_ioctl
,
.
compat_ioctl
=
loop_control_ioctl
,
.
owner
=
THIS_MODULE
,
.
llseek
=
noop_llseek
,
};
static
struct
miscdevice
loop_misc
=
{
.
minor
=
LOOP_CTRL_MINOR
,
.
name
=
"loop-control"
,
.
fops
=
&
loop_ctl_fops
,
};
MODULE_ALIAS_MISCDEV
(
LOOP_CTRL_MINOR
);
MODULE_ALIAS
(
"devname:loop-control"
);
static
int
__init
loop_init
(
void
)
{
int
i
,
nr
;
unsigned
long
range
;
struct
loop_device
*
lo
;
int
err
;
part_shift
=
0
;
if
(
max_part
>
0
)
{
part_shift
=
fls
(
max_part
);
/*
* Adjust max_part according to part_shift as it is exported
* to user space so that user can decide correct minor number
* if [s]he want to create more devices.
*
* Note that -1 is required because partition 0 is reserved
* for the whole disk.
*/
max_part
=
(
1UL
<<
part_shift
)
-
1
;
}
if
((
1UL
<<
part_shift
)
>
DISK_MAX_PARTS
)
{
err
=
-
EINVAL
;
goto
err_out
;
}
if
(
max_loop
>
1UL
<<
(
MINORBITS
-
part_shift
))
{
err
=
-
EINVAL
;
goto
err_out
;
}
/*
* If max_loop is specified, create that many devices upfront.
* This also becomes a hard limit. If max_loop is not specified,
* create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
* init time. Loop devices can be requested on-demand with the
* /dev/loop-control interface, or be instantiated by accessing
* a 'dead' device node.
*/
if
(
max_loop
)
{
nr
=
max_loop
;
range
=
max_loop
<<
part_shift
;
}
else
{
nr
=
CONFIG_BLK_DEV_LOOP_MIN_COUNT
;
range
=
1UL
<<
MINORBITS
;
}
err
=
misc_register
(
&
loop_misc
);
if
(
err
<
0
)
goto
err_out
;
if
(
register_blkdev
(
LOOP_MAJOR
,
"loop"
))
{
err
=
-
EIO
;
goto
misc_out
;
}
blk_register_region
(
MKDEV
(
LOOP_MAJOR
,
0
),
range
,
THIS_MODULE
,
loop_probe
,
NULL
,
NULL
);
/* pre-create number of devices given by config or max_loop */
mutex_lock
(
&
loop_index_mutex
);
for
(
i
=
0
;
i
<
nr
;
i
++
)
loop_add
(
&
lo
,
i
);
mutex_unlock
(
&
loop_index_mutex
);
printk
(
KERN_INFO
"loop: module loaded
\n
"
);
return
0
;
misc_out:
misc_deregister
(
&
loop_misc
);
err_out:
return
err
;
}
static
int
loop_exit_cb
(
int
id
,
void
*
ptr
,
void
*
data
)
{
struct
loop_device
*
lo
=
ptr
;
loop_remove
(
lo
);
return
0
;
}
static
void
__exit
loop_exit
(
void
)
{
unsigned
long
range
;
range
=
max_loop
?
max_loop
<<
part_shift
:
1UL
<<
MINORBITS
;
idr_for_each
(
&
loop_index_idr
,
&
loop_exit_cb
,
NULL
);
idr_destroy
(
&
loop_index_idr
);
blk_unregister_region
(
MKDEV
(
LOOP_MAJOR
,
0
),
range
);
unregister_blkdev
(
LOOP_MAJOR
,
"loop"
);
misc_deregister
(
&
loop_misc
);
}
module_init
(
loop_init
);
module_exit
(
loop_exit
);
#ifndef MODULE
static
int
__init
max_loop_setup
(
char
*
str
)
{
max_loop
=
simple_strtol
(
str
,
NULL
,
0
);
return
1
;
}
__setup
(
"max_loop="
,
max_loop_setup
);
#endif
src/fs/block_dev.c
View file @ f1301897
...
@@ -174,7 +174,7 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
 	return op;
 }
 
-#define DIO_INLINE_BIO_VECS 4
+#define DIO_INLINE_BIO_VECS BIO_MAX_PAGES // 4
 
 static void blkdev_bio_end_io_simple(struct bio *bio)
 {
...
@@ -197,11 +197,13 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	ssize_t ret;
 	blk_qc_t qc;
 	int i;
+	pr_debug("pos = %lld, nr_pages = %d, type=%d, iov_offset=0x%x, count=%d, iter->iov->iov_base=%p, iter->iov->iov_len=%d, nr_segs = %ld", \
+			pos, nr_pages, iter->type, iter->iov_offset, iter->count, iter->iov->iov_base, iter->iov->iov_len, iter->nr_segs); // pos - in file
 
 	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
+	    (bdev_logical_block_size(bdev) - 1)) {
+		pr_debug("pos = %lld, iov_iter_alignment(iter) = %ld, nr_pages = %d, bdev_logical_block_size(bdev) = %d", pos, iov_iter_alignment(iter), nr_pages, bdev_logical_block_size(bdev));
 		return -EINVAL;
+	}
 
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
...
@@ -210,7 +212,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 		if (!vecs)
 			return -ENOMEM;
 	}
+	pr_debug("DIO_INLINE_BIO_VECS = %d, nr_pages = %d", DIO_INLINE_BIO_VECS, nr_pages);
 
 	bio_init(&bio, vecs, nr_pages);
 	bio_set_dev(&bio, bdev);
 	bio.bi_iter.bi_sector = pos >> 9;
...
@@ -218,12 +220,12 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	bio.bi_private = current;
 	bio.bi_end_io = blkdev_bio_end_io_simple;
 	bio.bi_ioprio = iocb->ki_ioprio;
 
 	ret = bio_iov_iter_get_pages(&bio, iter);
+	pr_debug("ret = %d", ret); // -14
 	if (unlikely(ret))
 		goto out;
 	ret = bio.bi_iter.bi_size;
+	// Does nothing as CONFIG_TASK_IO_ACCOUNTING in not defined
 	if (iov_iter_rw(iter) == READ) {
 		bio.bi_opf = REQ_OP_READ;
 		if (iter_is_iovec(iter))
...
@@ -243,12 +245,13 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 		io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
+	pr_debug("after __set_current_state(TASK_RUNNING)");
 
 	bio_for_each_segment_all(bvec, &bio, i) {
 		if (should_dirty && !PageCompound(bvec->bv_page))
 			set_page_dirty_lock(bvec->bv_page);
 		put_page(bvec->bv_page);
 	}
+	pr_debug("bio.bi_status = %d", bio.bi_status);
 	if (unlikely(bio.bi_status))
 		ret = blk_status_to_errno(bio.bi_status);
...
@@ -332,11 +335,12 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
 	int ret = 0;
+	pr_debug("pos = %lld, nr_pages = %d", pos, nr_pages);
 
 	if ((pos | iov_iter_alignment(iter)) &
-	    (bdev_logical_block_size(bdev) - 1))
+	    (bdev_logical_block_size(bdev) - 1)) {
+		pr_debug("pos = %lld, iov_iter_alignment(iter) = %ld, nr_pages = %d, bdev_logical_block_size(bdev) = %d", pos, iov_iter_alignment(iter), nr_pages, bdev_logical_block_size(bdev));
 		return -EINVAL;
+	}
 
 	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
 	bio_get(bio); /* extra ref for the completion handler */
...
@@ -424,8 +428,8 @@ static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	int nr_pages;
 
 	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
+	pr_debug("nr_pages = %d, is_sync_kiocb(iocb) = %d", nr_pages, is_sync_kiocb(iocb));
 	if (!nr_pages)
 		return 0;
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
...
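Both rejection paths instrumented above fire when the file position or the user buffer fails the `(pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)` test, i.e. when either is not a multiple of the device's logical block size. A minimal user-space sketch, not part of the commit, of an O_DIRECT read that satisfies this check; the device path and the 512-byte block size are assumptions for illustration:

/* Hedged sketch: buffer address, length and file offset are all multiples
 * of the assumed 512-byte logical block size, so the kernel check passes. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("/dev/sda2", O_RDONLY | O_DIRECT);	/* illustrative device */
	if (fd < 0)
		return 1;
	if (posix_memalign(&buf, 512, 4096))
		return 1;
	pread(fd, buf, 4096, 512 * 8);	/* aligned length and offset */
	free(buf);
	close(fd);
	return 0;
}

If any of the three values is misaligned, the request returns -EINVAL and the added pr_debug reports the offending position, alignment and block size.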
src/fs/read_write.c
View file @ f1301897
...
@@ -466,7 +466,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 	struct kiocb kiocb;
 	struct iov_iter iter;
 	ssize_t ret;
+	if (!strncmp(filp->f_path.dentry->d_name.name, "sda2", 4)) {
+		pr_debug("pos=%llu, len=%d, buf=0x%p", *ppos, len, buf);
+//		pr_debug("pos=%llu, len=%d, buf=0x%p, data=0x%llx", *ppos, len, buf, ((loff_t*) (buf))[0]);
+	}
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	iov_iter_init(&iter, WRITE, &iov, 1, len);
...
@@ -481,6 +483,10 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 		    loff_t *pos)
 {
+	if (!strncmp(file->f_path.dentry->d_name.name, "sda2", 4)) {
+		pr_debug("pos=%llu, count=%d, p=0x%p", *pos, count, p);
+//		pr_debug("pos=%llu, count=%d, p=0x%p, data=0x%llx", *pos, count, p, ((loff_t*) (p))[0]);
+	}
 	if (file->f_op->write)
 		return file->f_op->write(file, p, count, pos);
 	else if (file->f_op->write_iter)
...
@@ -661,19 +667,22 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 {
 	return ksys_pwrite64(fd, buf, count, pos);
 }
 
 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 		loff_t *ppos, int type, rwf_t flags)
 {
 	struct kiocb kiocb;
 	ssize_t ret;
-	pr_debug("pos=%llu, flags=0x%x, type=%s", *ppos, flags, type ? "WRITE" : "READ");
+	if (!strncmp(filp->f_path.dentry->d_name.name, "sda2", 4)) {
+		pr_debug("pos=%llu, flags=0x%x, type=%s", *ppos, flags, type ? "WRITE" : "READ");
+		pr_debug("iov_offset=0x%x, count=%d, iter->iov->iov_base=%p, iter->iov->iov_len=%d, nr_segs = %ld", \
+				iter->iov_offset, iter->count, iter->iov->iov_base, iter->iov->iov_len, iter->nr_segs); // pos - in file
+//		pr_debug("data=0x%llx", ((loff_t*) (iter->iov->iov_base))[0]);
+	}
 	init_sync_kiocb(&kiocb, filp);
 	ret = kiocb_set_rw_flags(&kiocb, flags);
 	if (ret)
 		return ret;
 	kiocb.ki_pos = *ppos;
 
 	if (type == READ)
 		ret = call_read_iter(filp, &kiocb, iter);
 	else
...
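All three hunks above gate their tracing on the literal device name "sda2". A hedged sketch of the same filter factored into a helper; the helper name is illustrative and not part of the commit:

/* Illustrative only: limit tracing to one block device by the last path
 * component of the opened file, e.g. "sda2" for /dev/sda2, exactly as the
 * checks added in this commit do inline. */
static inline bool trace_this_file(const struct file *filp)
{
	return !strncmp(filp->f_path.dentry->d_name.name, "sda2", 4);
}

Note that these pr_debug() calls only produce output when dynamic debug is enabled for the file, or when DEBUG is defined at build time.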
src/lib/iov_iter.c
0 → 100644
View file @ f1301897
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <net/checksum.h>
#define PIPE_PARANOIA /* for now */
#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
size_t left; \
size_t wanted = n; \
__p = i->iov; \
__v.iov_len = min(n, __p->iov_len - skip); \
if (likely(__v.iov_len)) { \
__v.iov_base = __p->iov_base + skip; \
left = (STEP); \
__v.iov_len -= left; \
skip += __v.iov_len; \
n -= __v.iov_len; \
} else { \
left = 0; \
} \
while (unlikely(!left && n)) { \
__p++; \
__v.iov_len = min(n, __p->iov_len); \
if (unlikely(!__v.iov_len)) \
continue; \
__v.iov_base = __p->iov_base; \
left = (STEP); \
__v.iov_len -= left; \
skip = __v.iov_len; \
n -= __v.iov_len; \
} \
n = wanted - n; \
}
#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
size_t wanted = n; \
__p = i->kvec; \
__v.iov_len = min(n, __p->iov_len - skip); \
if (likely(__v.iov_len)) { \
__v.iov_base = __p->iov_base + skip; \
(void)(STEP); \
skip += __v.iov_len; \
n -= __v.iov_len; \
} \
while (unlikely(n)) { \
__p++; \
__v.iov_len = min(n, __p->iov_len); \
if (unlikely(!__v.iov_len)) \
continue; \
__v.iov_base = __p->iov_base; \
(void)(STEP); \
skip = __v.iov_len; \
n -= __v.iov_len; \
} \
n = wanted; \
}
#define iterate_bvec(i, n, __v, __bi, skip, STEP) { \
struct bvec_iter __start; \
__start.bi_size = n; \
__start.bi_bvec_done = skip; \
__start.bi_idx = 0; \
for_each_bvec(__v, i->bvec, __bi, __start) { \
if (!__v.bv_len) \
continue; \
(void)(STEP); \
} \
}
#define iterate_all_kinds(i, n, v, I, B, K) { \
if (likely(n)) { \
size_t skip = i->iov_offset; \
if (unlikely(i->type & ITER_BVEC)) { \
struct bio_vec v; \
struct bvec_iter __bi; \
iterate_bvec(i, n, v, __bi, skip, (B)) \
} else if (unlikely(i->type & ITER_KVEC)) { \
const struct kvec *kvec; \
struct kvec v; \
iterate_kvec(i, n, v, kvec, skip, (K)) \
} else { \
const struct iovec *iov; \
struct iovec v; \
iterate_iovec(i, n, v, iov, skip, (I)) \
} \
} \
}
#define iterate_and_advance(i, n, v, I, B, K) { \
if (unlikely(i->count < n)) \
n = i->count; \
if (i->count) { \
size_t skip = i->iov_offset; \
if (unlikely(i->type & ITER_BVEC)) { \
const struct bio_vec *bvec = i->bvec; \
struct bio_vec v; \
struct bvec_iter __bi; \
iterate_bvec(i, n, v, __bi, skip, (B)) \
i->bvec = __bvec_iter_bvec(i->bvec, __bi); \
i->nr_segs -= i->bvec - bvec; \
skip = __bi.bi_bvec_done; \
} else if (unlikely(i->type & ITER_KVEC)) { \
const struct kvec *kvec; \
struct kvec v; \
iterate_kvec(i, n, v, kvec, skip, (K)) \
if (skip == kvec->iov_len) { \
kvec++; \
skip = 0; \
} \
i->nr_segs -= kvec - i->kvec; \
i->kvec = kvec; \
} else { \
const struct iovec *iov; \
struct iovec v; \
iterate_iovec(i, n, v, iov, skip, (I)) \
if (skip == iov->iov_len) { \
iov++; \
skip = 0; \
} \
i->nr_segs -= iov - i->iov; \
i->iov = iov; \
} \
i->count -= n; \
i->iov_offset = skip; \
} \
}
static int copyout(void __user *to, const void *from, size_t n)
{
	if (access_ok(VERIFY_WRITE, to, n)) {
		kasan_check_read(from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (access_ok(VERIFY_READ, from, n)) {
		kasan_check_write(to, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}
static
size_t
copy_page_to_iter_iovec
(
struct
page
*
page
,
size_t
offset
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
size_t
skip
,
copy
,
left
,
wanted
;
const
struct
iovec
*
iov
;
char
__user
*
buf
;
void
*
kaddr
,
*
from
;
if
(
unlikely
(
bytes
>
i
->
count
))
bytes
=
i
->
count
;
if
(
unlikely
(
!
bytes
))
return
0
;
might_fault
();
wanted
=
bytes
;
iov
=
i
->
iov
;
skip
=
i
->
iov_offset
;
buf
=
iov
->
iov_base
+
skip
;
copy
=
min
(
bytes
,
iov
->
iov_len
-
skip
);
if
(
IS_ENABLED
(
CONFIG_HIGHMEM
)
&&
!
fault_in_pages_writeable
(
buf
,
copy
))
{
kaddr
=
kmap_atomic
(
page
);
from
=
kaddr
+
offset
;
/* first chunk, usually the only one */
left
=
copyout
(
buf
,
from
,
copy
);
copy
-=
left
;
skip
+=
copy
;
from
+=
copy
;
bytes
-=
copy
;
while
(
unlikely
(
!
left
&&
bytes
))
{
iov
++
;
buf
=
iov
->
iov_base
;
copy
=
min
(
bytes
,
iov
->
iov_len
);
left
=
copyout
(
buf
,
from
,
copy
);
copy
-=
left
;
skip
=
copy
;
from
+=
copy
;
bytes
-=
copy
;
}
if
(
likely
(
!
bytes
))
{
kunmap_atomic
(
kaddr
);
goto
done
;
}
offset
=
from
-
kaddr
;
buf
+=
copy
;
kunmap_atomic
(
kaddr
);
copy
=
min
(
bytes
,
iov
->
iov_len
-
skip
);
}
/* Too bad - revert to non-atomic kmap */
kaddr
=
kmap
(
page
);
from
=
kaddr
+
offset
;
left
=
copyout
(
buf
,
from
,
copy
);
copy
-=
left
;
skip
+=
copy
;
from
+=
copy
;
bytes
-=
copy
;
while
(
unlikely
(
!
left
&&
bytes
))
{
iov
++
;
buf
=
iov
->
iov_base
;
copy
=
min
(
bytes
,
iov
->
iov_len
);
left
=
copyout
(
buf
,
from
,
copy
);
copy
-=
left
;
skip
=
copy
;
from
+=
copy
;
bytes
-=
copy
;
}
kunmap
(
page
);
done:
if
(
skip
==
iov
->
iov_len
)
{
iov
++
;
skip
=
0
;
}
i
->
count
-=
wanted
-
bytes
;
i
->
nr_segs
-=
iov
-
i
->
iov
;
i
->
iov
=
iov
;
i
->
iov_offset
=
skip
;
return
wanted
-
bytes
;
}
static
size_t
copy_page_from_iter_iovec
(
struct
page
*
page
,
size_t
offset
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
size_t
skip
,
copy
,
left
,
wanted
;
const
struct
iovec
*
iov
;
char
__user
*
buf
;
void
*
kaddr
,
*
to
;
if
(
unlikely
(
bytes
>
i
->
count
))
bytes
=
i
->
count
;
if
(
unlikely
(
!
bytes
))
return
0
;
might_fault
();
wanted
=
bytes
;
iov
=
i
->
iov
;
skip
=
i
->
iov_offset
;
buf
=
iov
->
iov_base
+
skip
;
copy
=
min
(
bytes
,
iov
->
iov_len
-
skip
);
if
(
IS_ENABLED
(
CONFIG_HIGHMEM
)
&&
!
fault_in_pages_readable
(
buf
,
copy
))
{
kaddr
=
kmap_atomic
(
page
);
to
=
kaddr
+
offset
;
/* first chunk, usually the only one */
left
=
copyin
(
to
,
buf
,
copy
);
copy
-=
left
;
skip
+=
copy
;
to
+=
copy
;
bytes
-=
copy
;
while
(
unlikely
(
!
left
&&
bytes
))
{
iov
++
;
buf
=
iov
->
iov_base
;
copy
=
min
(
bytes
,
iov
->
iov_len
);
left
=
copyin
(
to
,
buf
,
copy
);
copy
-=
left
;
skip
=
copy
;
to
+=
copy
;
bytes
-=
copy
;
}
if
(
likely
(
!
bytes
))
{
kunmap_atomic
(
kaddr
);
goto
done
;
}
offset
=
to
-
kaddr
;
buf
+=
copy
;
kunmap_atomic
(
kaddr
);
copy
=
min
(
bytes
,
iov
->
iov_len
-
skip
);
}
/* Too bad - revert to non-atomic kmap */
kaddr
=
kmap
(
page
);
to
=
kaddr
+
offset
;
left
=
copyin
(
to
,
buf
,
copy
);
copy
-=
left
;
skip
+=
copy
;
to
+=
copy
;
bytes
-=
copy
;
while
(
unlikely
(
!
left
&&
bytes
))
{
iov
++
;
buf
=
iov
->
iov_base
;
copy
=
min
(
bytes
,
iov
->
iov_len
);
left
=
copyin
(
to
,
buf
,
copy
);
copy
-=
left
;
skip
=
copy
;
to
+=
copy
;
bytes
-=
copy
;
}
kunmap
(
page
);
done:
if
(
skip
==
iov
->
iov_len
)
{
iov
++
;
skip
=
0
;
}
i
->
count
-=
wanted
-
bytes
;
i
->
nr_segs
-=
iov
-
i
->
iov
;
i
->
iov
=
iov
;
i
->
iov_offset
=
skip
;
return
wanted
-
bytes
;
}
#ifdef PIPE_PARANOIA
static
bool
sanity
(
const
struct
iov_iter
*
i
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
int
idx
=
i
->
idx
;
int
next
=
pipe
->
curbuf
+
pipe
->
nrbufs
;
if
(
i
->
iov_offset
)
{
struct
pipe_buffer
*
p
;
if
(
unlikely
(
!
pipe
->
nrbufs
))
goto
Bad
;
// pipe must be non-empty
if
(
unlikely
(
idx
!=
((
next
-
1
)
&
(
pipe
->
buffers
-
1
))))
goto
Bad
;
// must be at the last buffer...
p
=
&
pipe
->
bufs
[
idx
];
if
(
unlikely
(
p
->
offset
+
p
->
len
!=
i
->
iov_offset
))
goto
Bad
;
// ... at the end of segment
}
else
{
if
(
idx
!=
(
next
&
(
pipe
->
buffers
-
1
)))
goto
Bad
;
// must be right after the last buffer
}
return
true
;
Bad:
printk
(
KERN_ERR
"idx = %d, offset = %zd
\n
"
,
i
->
idx
,
i
->
iov_offset
);
printk
(
KERN_ERR
"curbuf = %d, nrbufs = %d, buffers = %d
\n
"
,
pipe
->
curbuf
,
pipe
->
nrbufs
,
pipe
->
buffers
);
for
(
idx
=
0
;
idx
<
pipe
->
buffers
;
idx
++
)
printk
(
KERN_ERR
"[%p %p %d %d]
\n
"
,
pipe
->
bufs
[
idx
].
ops
,
pipe
->
bufs
[
idx
].
page
,
pipe
->
bufs
[
idx
].
offset
,
pipe
->
bufs
[
idx
].
len
);
WARN_ON
(
1
);
return
false
;
}
#else
#define sanity(i) true
#endif
static
inline
int
next_idx
(
int
idx
,
struct
pipe_inode_info
*
pipe
)
{
return
(
idx
+
1
)
&
(
pipe
->
buffers
-
1
);
}
static
size_t
copy_page_to_iter_pipe
(
struct
page
*
page
,
size_t
offset
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
struct
pipe_buffer
*
buf
;
size_t
off
;
int
idx
;
if
(
unlikely
(
bytes
>
i
->
count
))
bytes
=
i
->
count
;
if
(
unlikely
(
!
bytes
))
return
0
;
if
(
!
sanity
(
i
))
return
0
;
off
=
i
->
iov_offset
;
idx
=
i
->
idx
;
buf
=
&
pipe
->
bufs
[
idx
];
if
(
off
)
{
if
(
offset
==
off
&&
buf
->
page
==
page
)
{
/* merge with the last one */
buf
->
len
+=
bytes
;
i
->
iov_offset
+=
bytes
;
goto
out
;
}
idx
=
next_idx
(
idx
,
pipe
);
buf
=
&
pipe
->
bufs
[
idx
];
}
if
(
idx
==
pipe
->
curbuf
&&
pipe
->
nrbufs
)
return
0
;
pipe
->
nrbufs
++
;
buf
->
ops
=
&
page_cache_pipe_buf_ops
;
get_page
(
buf
->
page
=
page
);
buf
->
offset
=
offset
;
buf
->
len
=
bytes
;
i
->
iov_offset
=
offset
+
bytes
;
i
->
idx
=
idx
;
out:
i
->
count
-=
bytes
;
return
bytes
;
}
/*
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
* bytes. For each iovec, fault in each page that constitutes the iovec.
*
* Return 0 on success, or non-zero if the memory could not be accessed (i.e.
* because it is an invalid address).
*/
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
	size_t skip = i->iov_offset;
	const struct iovec *iov;
	int err;
	struct iovec v;

	if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
		iterate_iovec(i, bytes, v, iov, skip, ({
			err = fault_in_pages_readable(v.iov_base, v.iov_len);
			if (unlikely(err))
			return err;
		0;}))
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

void iov_iter_init(struct iov_iter *i, int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	/* It will get better.  Eventually... */
	if (uaccess_kernel()) {
		direction |= ITER_KVEC;
		i->type = direction;
		i->kvec = (struct kvec *)iov;
	} else {
		i->type = direction;
		i->iov = iov;
	}
	i->nr_segs = nr_segs;
	i->iov_offset = 0;
	i->count = count;
}
EXPORT_SYMBOL(iov_iter_init);
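new_sync_write() in the read_write.c hunk above feeds a single user iovec through iov_iter_init(); for kernel buffers this file provides iov_iter_kvec(), whose direction argument must include ITER_KVEC in this kernel (see the BUG_ON further down). A hedged sketch, with illustrative names, of building such an iterator and draining it with copy_to_iter():

/* Hedged sketch, not part of the commit: copy a kernel buffer out through
 * an iov_iter built over a single kvec. READ means the iterator describes
 * the destination of the transfer. */
static size_t copy_out_example(void *dst, size_t dst_len,
			       const void *src, size_t src_len)
{
	struct kvec kv = { .iov_base = dst, .iov_len = dst_len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_KVEC | READ, &kv, 1, dst_len);
	return copy_to_iter(src, src_len, &iter);	/* bytes actually copied */
}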
static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
{
	char *from = kmap_atomic(page);
	memcpy(to, from + offset, len);
	kunmap_atomic(from);
}

static void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
{
	char *to = kmap_atomic(page);
	memcpy(to + offset, from, len);
	kunmap_atomic(to);
}

static void memzero_page(struct page *page, size_t offset, size_t len)
{
	char *addr = kmap_atomic(page);
	memset(addr + offset, 0, len);
	kunmap_atomic(addr);
}

static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
{
	size_t off = i->iov_offset;
	int idx = i->idx;
	if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) {
		idx = next_idx(idx, i->pipe);
		off = 0;
	}
	*idxp = idx;
	*offp = off;
}
static
size_t
push_pipe
(
struct
iov_iter
*
i
,
size_t
size
,
int
*
idxp
,
size_t
*
offp
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
size_t
off
;
int
idx
;
ssize_t
left
;
if
(
unlikely
(
size
>
i
->
count
))
size
=
i
->
count
;
if
(
unlikely
(
!
size
))
return
0
;
left
=
size
;
data_start
(
i
,
&
idx
,
&
off
);
*
idxp
=
idx
;
*
offp
=
off
;
if
(
off
)
{
left
-=
PAGE_SIZE
-
off
;
if
(
left
<=
0
)
{
pipe
->
bufs
[
idx
].
len
+=
size
;
return
size
;
}
pipe
->
bufs
[
idx
].
len
=
PAGE_SIZE
;
idx
=
next_idx
(
idx
,
pipe
);
}
while
(
idx
!=
pipe
->
curbuf
||
!
pipe
->
nrbufs
)
{
struct
page
*
page
=
alloc_page
(
GFP_USER
);
if
(
!
page
)
break
;
pipe
->
nrbufs
++
;
pipe
->
bufs
[
idx
].
ops
=
&
default_pipe_buf_ops
;
pipe
->
bufs
[
idx
].
page
=
page
;
pipe
->
bufs
[
idx
].
offset
=
0
;
if
(
left
<=
PAGE_SIZE
)
{
pipe
->
bufs
[
idx
].
len
=
left
;
return
size
;
}
pipe
->
bufs
[
idx
].
len
=
PAGE_SIZE
;
left
-=
PAGE_SIZE
;
idx
=
next_idx
(
idx
,
pipe
);
}
return
size
-
left
;
}
static
size_t
copy_pipe_to_iter
(
const
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
size_t
n
,
off
;
int
idx
;
if
(
!
sanity
(
i
))
return
0
;
bytes
=
n
=
push_pipe
(
i
,
bytes
,
&
idx
,
&
off
);
if
(
unlikely
(
!
n
))
return
0
;
for
(
;
n
;
idx
=
next_idx
(
idx
,
pipe
),
off
=
0
)
{
size_t
chunk
=
min_t
(
size_t
,
n
,
PAGE_SIZE
-
off
);
memcpy_to_page
(
pipe
->
bufs
[
idx
].
page
,
off
,
addr
,
chunk
);
i
->
idx
=
idx
;
i
->
iov_offset
=
off
+
chunk
;
n
-=
chunk
;
addr
+=
chunk
;
}
i
->
count
-=
bytes
;
return
bytes
;
}
size_t
_copy_to_iter
(
const
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
const
char
*
from
=
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
return
copy_pipe_to_iter
(
addr
,
bytes
,
i
);
if
(
iter_is_iovec
(
i
))
might_fault
();
iterate_and_advance
(
i
,
bytes
,
v
,
copyout
(
v
.
iov_base
,
(
from
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
),
memcpy_to_page
(
v
.
bv_page
,
v
.
bv_offset
,
(
from
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_len
),
memcpy
(
v
.
iov_base
,
(
from
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
)
)
return
bytes
;
}
EXPORT_SYMBOL
(
_copy_to_iter
);
#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
static
int
copyout_mcsafe
(
void
__user
*
to
,
const
void
*
from
,
size_t
n
)
{
if
(
access_ok
(
VERIFY_WRITE
,
to
,
n
))
{
kasan_check_read
(
from
,
n
);
n
=
copy_to_user_mcsafe
((
__force
void
*
)
to
,
from
,
n
);
}
return
n
;
}
static
unsigned
long
memcpy_mcsafe_to_page
(
struct
page
*
page
,
size_t
offset
,
const
char
*
from
,
size_t
len
)
{
unsigned
long
ret
;
char
*
to
;
to
=
kmap_atomic
(
page
);
ret
=
memcpy_mcsafe
(
to
+
offset
,
from
,
len
);
kunmap_atomic
(
to
);
return
ret
;
}
static
size_t
copy_pipe_to_iter_mcsafe
(
const
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
size_t
n
,
off
,
xfer
=
0
;
int
idx
;
if
(
!
sanity
(
i
))
return
0
;
bytes
=
n
=
push_pipe
(
i
,
bytes
,
&
idx
,
&
off
);
if
(
unlikely
(
!
n
))
return
0
;
for
(
;
n
;
idx
=
next_idx
(
idx
,
pipe
),
off
=
0
)
{
size_t
chunk
=
min_t
(
size_t
,
n
,
PAGE_SIZE
-
off
);
unsigned
long
rem
;
rem
=
memcpy_mcsafe_to_page
(
pipe
->
bufs
[
idx
].
page
,
off
,
addr
,
chunk
);
i
->
idx
=
idx
;
i
->
iov_offset
=
off
+
chunk
-
rem
;
xfer
+=
chunk
-
rem
;
if
(
rem
)
break
;
n
-=
chunk
;
addr
+=
chunk
;
}
i
->
count
-=
xfer
;
return
xfer
;
}
/**
* _copy_to_iter_mcsafe - copy to user with source-read error exception handling
* @addr: source kernel address
* @bytes: total transfer length
* @iter: destination iterator
*
* The pmem driver arranges for filesystem-dax to use this facility via
* dax_copy_to_iter() for protecting read/write to persistent memory.
* Unless / until an architecture can guarantee identical performance
* between _copy_to_iter_mcsafe() and _copy_to_iter() it would be a
* performance regression to switch more users to the mcsafe version.
*
* Otherwise, the main differences between this and typical _copy_to_iter().
*
* * Typical tail/residue handling after a fault retries the copy
* byte-by-byte until the fault happens again. Re-triggering machine
* checks is potentially fatal so the implementation uses source
* alignment and poison alignment assumptions to avoid re-triggering
* hardware exceptions.
*
* * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
* Compare to copy_to_iter() where only ITER_IOVEC attempts might return
* a short copy.
*
* See MCSAFE_TEST for self-test.
*/
size_t
_copy_to_iter_mcsafe
(
const
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
const
char
*
from
=
addr
;
unsigned
long
rem
,
curr_addr
,
s_addr
=
(
unsigned
long
)
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
return
copy_pipe_to_iter_mcsafe
(
addr
,
bytes
,
i
);
if
(
iter_is_iovec
(
i
))
might_fault
();
iterate_and_advance
(
i
,
bytes
,
v
,
copyout_mcsafe
(
v
.
iov_base
,
(
from
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
),
({
rem
=
memcpy_mcsafe_to_page
(
v
.
bv_page
,
v
.
bv_offset
,
(
from
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_len
);
if
(
rem
)
{
curr_addr
=
(
unsigned
long
)
from
;
bytes
=
curr_addr
-
s_addr
-
rem
;
return
bytes
;
}
}),
({
rem
=
memcpy_mcsafe
(
v
.
iov_base
,
(
from
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
);
if
(
rem
)
{
curr_addr
=
(
unsigned
long
)
from
;
bytes
=
curr_addr
-
s_addr
-
rem
;
return
bytes
;
}
})
)
return
bytes
;
}
EXPORT_SYMBOL_GPL
(
_copy_to_iter_mcsafe
);
#endif
/* CONFIG_ARCH_HAS_UACCESS_MCSAFE */
size_t
_copy_from_iter
(
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
0
;
}
if
(
iter_is_iovec
(
i
))
might_fault
();
iterate_and_advance
(
i
,
bytes
,
v
,
copyin
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
),
memcpy_from_page
((
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memcpy
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
)
)
return
bytes
;
}
EXPORT_SYMBOL
(
_copy_from_iter
);
bool
_copy_from_iter_full
(
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
false
;
}
if
(
unlikely
(
i
->
count
<
bytes
))
return
false
;
if
(
iter_is_iovec
(
i
))
might_fault
();
iterate_all_kinds
(
i
,
bytes
,
v
,
({
if
(
copyin
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
))
return
false
;
0
;}),
memcpy_from_page
((
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memcpy
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
)
)
iov_iter_advance
(
i
,
bytes
);
return
true
;
}
EXPORT_SYMBOL
(
_copy_from_iter_full
);
size_t
_copy_from_iter_nocache
(
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
0
;
}
iterate_and_advance
(
i
,
bytes
,
v
,
__copy_from_user_inatomic_nocache
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
),
memcpy_from_page
((
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memcpy
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
)
)
return
bytes
;
}
EXPORT_SYMBOL
(
_copy_from_iter_nocache
);
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
* _copy_from_iter_flushcache - write destination through cpu cache
* @addr: destination kernel address
* @bytes: total transfer length
* @iter: source iterator
*
* The pmem driver arranges for filesystem-dax to use this facility via
* dax_copy_from_iter() for ensuring that writes to persistent memory
* are flushed through the CPU cache. It is differentiated from
* _copy_from_iter_nocache() in that guarantees all data is flushed for
* all iterator types. The _copy_from_iter_nocache() only attempts to
* bypass the cache for the ITER_IOVEC case, and on some archs may use
* instructions that strand dirty-data in the cache.
*/
size_t
_copy_from_iter_flushcache
(
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
0
;
}
iterate_and_advance
(
i
,
bytes
,
v
,
__copy_from_user_flushcache
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
),
memcpy_page_flushcache
((
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memcpy_flushcache
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
)
)
return
bytes
;
}
EXPORT_SYMBOL_GPL
(
_copy_from_iter_flushcache
);
#endif
bool
_copy_from_iter_full_nocache
(
void
*
addr
,
size_t
bytes
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
false
;
}
if
(
unlikely
(
i
->
count
<
bytes
))
return
false
;
iterate_all_kinds
(
i
,
bytes
,
v
,
({
if
(
__copy_from_user_inatomic_nocache
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
))
return
false
;
0
;}),
memcpy_from_page
((
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memcpy
((
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
)
)
iov_iter_advance
(
i
,
bytes
);
return
true
;
}
EXPORT_SYMBOL
(
_copy_from_iter_full_nocache
);
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head = compound_head(page);
	size_t v = n + offset + page_address(page) - page_address(head);

	if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head))))
		return true;
	WARN_ON(1);
	return false;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (i->type & (ITER_BVEC|ITER_KVEC)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	} else if (likely(!(i->type & ITER_PIPE)))
		return copy_page_to_iter_iovec(page, offset, bytes, i);
	else
		return copy_page_to_iter_pipe(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);
		return 0;
	}
	if (i->type & (ITER_BVEC|ITER_KVEC)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	} else
		return copy_page_from_iter_iovec(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_from_iter);
static
size_t
pipe_zero
(
size_t
bytes
,
struct
iov_iter
*
i
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
size_t
n
,
off
;
int
idx
;
if
(
!
sanity
(
i
))
return
0
;
bytes
=
n
=
push_pipe
(
i
,
bytes
,
&
idx
,
&
off
);
if
(
unlikely
(
!
n
))
return
0
;
for
(
;
n
;
idx
=
next_idx
(
idx
,
pipe
),
off
=
0
)
{
size_t
chunk
=
min_t
(
size_t
,
n
,
PAGE_SIZE
-
off
);
memzero_page
(
pipe
->
bufs
[
idx
].
page
,
off
,
chunk
);
i
->
idx
=
idx
;
i
->
iov_offset
=
off
+
chunk
;
n
-=
chunk
;
}
i
->
count
-=
bytes
;
return
bytes
;
}
size_t
iov_iter_zero
(
size_t
bytes
,
struct
iov_iter
*
i
)
{
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
return
pipe_zero
(
bytes
,
i
);
iterate_and_advance
(
i
,
bytes
,
v
,
clear_user
(
v
.
iov_base
,
v
.
iov_len
),
memzero_page
(
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memset
(
v
.
iov_base
,
0
,
v
.
iov_len
)
)
return
bytes
;
}
EXPORT_SYMBOL
(
iov_iter_zero
);
size_t
iov_iter_copy_from_user_atomic
(
struct
page
*
page
,
struct
iov_iter
*
i
,
unsigned
long
offset
,
size_t
bytes
)
{
char
*
kaddr
=
kmap_atomic
(
page
),
*
p
=
kaddr
+
offset
;
if
(
unlikely
(
!
page_copy_sane
(
page
,
offset
,
bytes
)))
{
kunmap_atomic
(
kaddr
);
return
0
;
}
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
kunmap_atomic
(
kaddr
);
WARN_ON
(
1
);
return
0
;
}
iterate_all_kinds
(
i
,
bytes
,
v
,
copyin
((
p
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
),
memcpy_from_page
((
p
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_page
,
v
.
bv_offset
,
v
.
bv_len
),
memcpy
((
p
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
)
)
kunmap_atomic
(
kaddr
);
return
bytes
;
}
EXPORT_SYMBOL
(
iov_iter_copy_from_user_atomic
);
static
inline
void
pipe_truncate
(
struct
iov_iter
*
i
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
if
(
pipe
->
nrbufs
)
{
size_t
off
=
i
->
iov_offset
;
int
idx
=
i
->
idx
;
int
nrbufs
=
(
idx
-
pipe
->
curbuf
)
&
(
pipe
->
buffers
-
1
);
if
(
off
)
{
pipe
->
bufs
[
idx
].
len
=
off
-
pipe
->
bufs
[
idx
].
offset
;
idx
=
next_idx
(
idx
,
pipe
);
nrbufs
++
;
}
while
(
pipe
->
nrbufs
>
nrbufs
)
{
pipe_buf_release
(
pipe
,
&
pipe
->
bufs
[
idx
]);
idx
=
next_idx
(
idx
,
pipe
);
pipe
->
nrbufs
--
;
}
}
}
static
void
pipe_advance
(
struct
iov_iter
*
i
,
size_t
size
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
if
(
unlikely
(
i
->
count
<
size
))
size
=
i
->
count
;
if
(
size
)
{
struct
pipe_buffer
*
buf
;
size_t
off
=
i
->
iov_offset
,
left
=
size
;
int
idx
=
i
->
idx
;
if
(
off
)
/* make it relative to the beginning of buffer */
left
+=
off
-
pipe
->
bufs
[
idx
].
offset
;
while
(
1
)
{
buf
=
&
pipe
->
bufs
[
idx
];
if
(
left
<=
buf
->
len
)
break
;
left
-=
buf
->
len
;
idx
=
next_idx
(
idx
,
pipe
);
}
i
->
idx
=
idx
;
i
->
iov_offset
=
buf
->
offset
+
left
;
}
i
->
count
-=
size
;
/* ... and discard everything past that point */
pipe_truncate
(
i
);
}
void
iov_iter_advance
(
struct
iov_iter
*
i
,
size_t
size
)
{
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
pipe_advance
(
i
,
size
);
return
;
}
iterate_and_advance
(
i
,
size
,
v
,
0
,
0
,
0
)
}
EXPORT_SYMBOL
(
iov_iter_advance
);
void
iov_iter_revert
(
struct
iov_iter
*
i
,
size_t
unroll
)
{
if
(
!
unroll
)
return
;
if
(
WARN_ON
(
unroll
>
MAX_RW_COUNT
))
return
;
i
->
count
+=
unroll
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
int
idx
=
i
->
idx
;
size_t
off
=
i
->
iov_offset
;
while
(
1
)
{
size_t
n
=
off
-
pipe
->
bufs
[
idx
].
offset
;
if
(
unroll
<
n
)
{
off
-=
unroll
;
break
;
}
unroll
-=
n
;
if
(
!
unroll
&&
idx
==
i
->
start_idx
)
{
off
=
0
;
break
;
}
if
(
!
idx
--
)
idx
=
pipe
->
buffers
-
1
;
off
=
pipe
->
bufs
[
idx
].
offset
+
pipe
->
bufs
[
idx
].
len
;
}
i
->
iov_offset
=
off
;
i
->
idx
=
idx
;
pipe_truncate
(
i
);
return
;
}
if
(
unroll
<=
i
->
iov_offset
)
{
i
->
iov_offset
-=
unroll
;
return
;
}
unroll
-=
i
->
iov_offset
;
if
(
i
->
type
&
ITER_BVEC
)
{
const
struct
bio_vec
*
bvec
=
i
->
bvec
;
while
(
1
)
{
size_t
n
=
(
--
bvec
)
->
bv_len
;
i
->
nr_segs
++
;
if
(
unroll
<=
n
)
{
i
->
bvec
=
bvec
;
i
->
iov_offset
=
n
-
unroll
;
return
;
}
unroll
-=
n
;
}
}
else
{
/* same logics for iovec and kvec */
const
struct
iovec
*
iov
=
i
->
iov
;
while
(
1
)
{
size_t
n
=
(
--
iov
)
->
iov_len
;
i
->
nr_segs
++
;
if
(
unroll
<=
n
)
{
i
->
iov
=
iov
;
i
->
iov_offset
=
n
-
unroll
;
return
;
}
unroll
-=
n
;
}
}
}
EXPORT_SYMBOL
(
iov_iter_revert
);
/*
* Return the count of just the current iov_iter segment.
*/
size_t
iov_iter_single_seg_count
(
const
struct
iov_iter
*
i
)
{
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
return
i
->
count
;
// it is a silly place, anyway
if
(
i
->
nr_segs
==
1
)
return
i
->
count
;
else
if
(
i
->
type
&
ITER_BVEC
)
return
min
(
i
->
count
,
i
->
bvec
->
bv_len
-
i
->
iov_offset
);
else
return
min
(
i
->
count
,
i
->
iov
->
iov_len
-
i
->
iov_offset
);
}
EXPORT_SYMBOL
(
iov_iter_single_seg_count
);
void
iov_iter_kvec
(
struct
iov_iter
*
i
,
int
direction
,
const
struct
kvec
*
kvec
,
unsigned
long
nr_segs
,
size_t
count
)
{
BUG_ON
(
!
(
direction
&
ITER_KVEC
));
i
->
type
=
direction
;
i
->
kvec
=
kvec
;
i
->
nr_segs
=
nr_segs
;
i
->
iov_offset
=
0
;
i
->
count
=
count
;
}
EXPORT_SYMBOL
(
iov_iter_kvec
);
void
iov_iter_bvec
(
struct
iov_iter
*
i
,
int
direction
,
const
struct
bio_vec
*
bvec
,
unsigned
long
nr_segs
,
size_t
count
)
{
BUG_ON
(
!
(
direction
&
ITER_BVEC
));
i
->
type
=
direction
;
i
->
bvec
=
bvec
;
i
->
nr_segs
=
nr_segs
;
i
->
iov_offset
=
0
;
i
->
count
=
count
;
}
EXPORT_SYMBOL
(
iov_iter_bvec
);
void
iov_iter_pipe
(
struct
iov_iter
*
i
,
int
direction
,
struct
pipe_inode_info
*
pipe
,
size_t
count
)
{
BUG_ON
(
direction
!=
ITER_PIPE
);
WARN_ON
(
pipe
->
nrbufs
==
pipe
->
buffers
);
i
->
type
=
direction
;
i
->
pipe
=
pipe
;
i
->
idx
=
(
pipe
->
curbuf
+
pipe
->
nrbufs
)
&
(
pipe
->
buffers
-
1
);
i
->
iov_offset
=
0
;
i
->
count
=
count
;
i
->
start_idx
=
i
->
idx
;
}
EXPORT_SYMBOL
(
iov_iter_pipe
);
unsigned
long
iov_iter_alignment
(
const
struct
iov_iter
*
i
)
{
unsigned
long
res
=
0
;
size_t
size
=
i
->
count
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
if
(
size
&&
i
->
iov_offset
&&
allocated
(
&
i
->
pipe
->
bufs
[
i
->
idx
]))
return
size
|
i
->
iov_offset
;
return
size
;
}
iterate_all_kinds
(
i
,
size
,
v
,
(
res
|=
(
unsigned
long
)
v
.
iov_base
|
v
.
iov_len
,
0
),
res
|=
v
.
bv_offset
|
v
.
bv_len
,
res
|=
(
unsigned
long
)
v
.
iov_base
|
v
.
iov_len
)
return
res
;
}
EXPORT_SYMBOL
(
iov_iter_alignment
);
unsigned
long
iov_iter_gap_alignment
(
const
struct
iov_iter
*
i
)
{
unsigned
long
res
=
0
;
size_t
size
=
i
->
count
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
~
0U
;
}
iterate_all_kinds
(
i
,
size
,
v
,
(
res
|=
(
!
res
?
0
:
(
unsigned
long
)
v
.
iov_base
)
|
(
size
!=
v
.
iov_len
?
size
:
0
),
0
),
(
res
|=
(
!
res
?
0
:
(
unsigned
long
)
v
.
bv_offset
)
|
(
size
!=
v
.
bv_len
?
size
:
0
)),
(
res
|=
(
!
res
?
0
:
(
unsigned
long
)
v
.
iov_base
)
|
(
size
!=
v
.
iov_len
?
size
:
0
))
);
return
res
;
}
EXPORT_SYMBOL
(
iov_iter_gap_alignment
);
static
inline
ssize_t
__pipe_get_pages
(
struct
iov_iter
*
i
,
size_t
maxsize
,
struct
page
**
pages
,
int
idx
,
size_t
*
start
)
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
ssize_t
n
=
push_pipe
(
i
,
maxsize
,
&
idx
,
start
);
if
(
!
n
)
return
-
EFAULT
;
maxsize
=
n
;
n
+=
*
start
;
while
(
n
>
0
)
{
get_page
(
*
pages
++
=
pipe
->
bufs
[
idx
].
page
);
idx
=
next_idx
(
idx
,
pipe
);
n
-=
PAGE_SIZE
;
}
return
maxsize
;
}
static
ssize_t
pipe_get_pages
(
struct
iov_iter
*
i
,
struct
page
**
pages
,
size_t
maxsize
,
unsigned
maxpages
,
size_t
*
start
)
{
unsigned
npages
;
size_t
capacity
;
int
idx
;
if
(
!
maxsize
)
return
0
;
if
(
!
sanity
(
i
))
return
-
EFAULT
;
data_start
(
i
,
&
idx
,
start
);
/* some of this one + all after this one */
npages
=
((
i
->
pipe
->
curbuf
-
idx
-
1
)
&
(
i
->
pipe
->
buffers
-
1
))
+
1
;
capacity
=
min
(
npages
,
maxpages
)
*
PAGE_SIZE
-
*
start
;
return
__pipe_get_pages
(
i
,
min
(
maxsize
,
capacity
),
pages
,
idx
,
start
);
}
ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	if (maxsize > i->count)
		maxsize = i->count;
	pr_debug("maxsize = %d, maxpages=%d, start (outut) = 0x%x ", maxsize, maxpages, *start);

	if (unlikely(i->type & ITER_PIPE))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	iterate_all_kinds(i, maxsize, v, ({
		unsigned long addr = (unsigned long)v.iov_base;
		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
		int n;
		int res;

		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		addr &= ~(PAGE_SIZE - 1);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
		if (unlikely(res < 0))
			return res;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	0;}),({
		/* can't be more than PAGE_SIZE */
		*start = v.bv_offset;
		get_page(*pages = v.bv_page);
		return v.bv_len;
	}),({
		return -EFAULT;
	})
	)
	return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages);
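bio_iov_iter_get_pages(), whose return value the block_dev.c hunk logs (the observed -14 is -EFAULT), ultimately relies on this helper to pin user pages. A hedged, illustrative sketch of the usual calling pattern; the batch size and the advance step are assumptions, not taken from the commit:

/* Hedged sketch, not from the commit: pin the next few pages referenced by
 * an iterator, hand them to I/O, then drop the references and advance. */
static ssize_t pin_pages_example(struct iov_iter *iter)
{
	struct page *pages[8];	/* illustrative fixed batch */
	size_t offset;
	ssize_t bytes;
	int n;

	bytes = iov_iter_get_pages(iter, pages, 8 * PAGE_SIZE, 8, &offset);
	if (bytes <= 0)
		return bytes;	/* 0: nothing left; negative: e.g. -EFAULT */

	n = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);
	/* ... submit pages[0..n-1], data starting at 'offset' in pages[0] ... */
	while (n--)
		put_page(pages[n]);
	iov_iter_advance(iter, bytes);	/* consume what was just pinned */
	return bytes;
}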
static
struct
page
**
get_pages_array
(
size_t
n
)
{
return
kvmalloc_array
(
n
,
sizeof
(
struct
page
*
),
GFP_KERNEL
);
}
static
ssize_t
pipe_get_pages_alloc
(
struct
iov_iter
*
i
,
struct
page
***
pages
,
size_t
maxsize
,
size_t
*
start
)
{
struct
page
**
p
;
ssize_t
n
;
int
idx
;
int
npages
;
if
(
!
maxsize
)
return
0
;
if
(
!
sanity
(
i
))
return
-
EFAULT
;
data_start
(
i
,
&
idx
,
start
);
/* some of this one + all after this one */
npages
=
((
i
->
pipe
->
curbuf
-
idx
-
1
)
&
(
i
->
pipe
->
buffers
-
1
))
+
1
;
n
=
npages
*
PAGE_SIZE
-
*
start
;
if
(
maxsize
>
n
)
maxsize
=
n
;
else
npages
=
DIV_ROUND_UP
(
maxsize
+
*
start
,
PAGE_SIZE
);
p
=
get_pages_array
(
npages
);
if
(
!
p
)
return
-
ENOMEM
;
n
=
__pipe_get_pages
(
i
,
maxsize
,
p
,
idx
,
start
);
if
(
n
>
0
)
*
pages
=
p
;
else
kvfree
(
p
);
return
n
;
}
ssize_t
iov_iter_get_pages_alloc
(
struct
iov_iter
*
i
,
struct
page
***
pages
,
size_t
maxsize
,
size_t
*
start
)
{
struct
page
**
p
;
if
(
maxsize
>
i
->
count
)
maxsize
=
i
->
count
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
return
pipe_get_pages_alloc
(
i
,
pages
,
maxsize
,
start
);
iterate_all_kinds
(
i
,
maxsize
,
v
,
({
unsigned
long
addr
=
(
unsigned
long
)
v
.
iov_base
;
size_t
len
=
v
.
iov_len
+
(
*
start
=
addr
&
(
PAGE_SIZE
-
1
));
int
n
;
int
res
;
addr
&=
~
(
PAGE_SIZE
-
1
);
n
=
DIV_ROUND_UP
(
len
,
PAGE_SIZE
);
p
=
get_pages_array
(
n
);
if
(
!
p
)
return
-
ENOMEM
;
res
=
get_user_pages_fast
(
addr
,
n
,
(
i
->
type
&
WRITE
)
!=
WRITE
,
p
);
if
(
unlikely
(
res
<
0
))
{
kvfree
(
p
);
return
res
;
}
*
pages
=
p
;
return
(
res
==
n
?
len
:
res
*
PAGE_SIZE
)
-
*
start
;
0
;}),({
/* can't be more than PAGE_SIZE */
*
start
=
v
.
bv_offset
;
*
pages
=
p
=
get_pages_array
(
1
);
if
(
!
p
)
return
-
ENOMEM
;
get_page
(
*
p
=
v
.
bv_page
);
return
v
.
bv_len
;
}),({
return
-
EFAULT
;
})
)
return
0
;
}
EXPORT_SYMBOL
(
iov_iter_get_pages_alloc
);
size_t
csum_and_copy_from_iter
(
void
*
addr
,
size_t
bytes
,
__wsum
*
csum
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
__wsum
sum
,
next
;
size_t
off
=
0
;
sum
=
*
csum
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
0
;
}
iterate_and_advance
(
i
,
bytes
,
v
,
({
int
err
=
0
;
next
=
csum_and_copy_from_user
(
v
.
iov_base
,
(
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
,
0
,
&
err
);
if
(
!
err
)
{
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
iov_len
;
}
err
?
v
.
iov_len
:
0
;
}),
({
char
*
p
=
kmap_atomic
(
v
.
bv_page
);
next
=
csum_partial_copy_nocheck
(
p
+
v
.
bv_offset
,
(
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_len
,
0
);
kunmap_atomic
(
p
);
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
bv_len
;
}),({
next
=
csum_partial_copy_nocheck
(
v
.
iov_base
,
(
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
,
0
);
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
iov_len
;
})
)
*
csum
=
sum
;
return
bytes
;
}
EXPORT_SYMBOL
(
csum_and_copy_from_iter
);
bool
csum_and_copy_from_iter_full
(
void
*
addr
,
size_t
bytes
,
__wsum
*
csum
,
struct
iov_iter
*
i
)
{
char
*
to
=
addr
;
__wsum
sum
,
next
;
size_t
off
=
0
;
sum
=
*
csum
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
false
;
}
if
(
unlikely
(
i
->
count
<
bytes
))
return
false
;
iterate_all_kinds
(
i
,
bytes
,
v
,
({
int
err
=
0
;
next
=
csum_and_copy_from_user
(
v
.
iov_base
,
(
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
,
0
,
&
err
);
if
(
err
)
return
false
;
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
iov_len
;
0
;
}),
({
char
*
p
=
kmap_atomic
(
v
.
bv_page
);
next
=
csum_partial_copy_nocheck
(
p
+
v
.
bv_offset
,
(
to
+=
v
.
bv_len
)
-
v
.
bv_len
,
v
.
bv_len
,
0
);
kunmap_atomic
(
p
);
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
bv_len
;
}),({
next
=
csum_partial_copy_nocheck
(
v
.
iov_base
,
(
to
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_len
,
0
);
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
iov_len
;
})
)
*
csum
=
sum
;
iov_iter_advance
(
i
,
bytes
);
return
true
;
}
EXPORT_SYMBOL
(
csum_and_copy_from_iter_full
);
size_t
csum_and_copy_to_iter
(
const
void
*
addr
,
size_t
bytes
,
__wsum
*
csum
,
struct
iov_iter
*
i
)
{
const
char
*
from
=
addr
;
__wsum
sum
,
next
;
size_t
off
=
0
;
sum
=
*
csum
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
/* for now */
return
0
;
}
iterate_and_advance
(
i
,
bytes
,
v
,
({
int
err
=
0
;
next
=
csum_and_copy_to_user
((
from
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
,
0
,
&
err
);
if
(
!
err
)
{
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
iov_len
;
}
err
?
v
.
iov_len
:
0
;
}),
({
char
*
p
=
kmap_atomic
(
v
.
bv_page
);
next
=
csum_partial_copy_nocheck
((
from
+=
v
.
bv_len
)
-
v
.
bv_len
,
p
+
v
.
bv_offset
,
v
.
bv_len
,
0
);
kunmap_atomic
(
p
);
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
bv_len
;
}),({
next
=
csum_partial_copy_nocheck
((
from
+=
v
.
iov_len
)
-
v
.
iov_len
,
v
.
iov_base
,
v
.
iov_len
,
0
);
sum
=
csum_block_add
(
sum
,
next
,
off
);
off
+=
v
.
iov_len
;
})
)
*
csum
=
sum
;
return
bytes
;
}
EXPORT_SYMBOL
(
csum_and_copy_to_iter
);
int
iov_iter_npages
(
const
struct
iov_iter
*
i
,
int
maxpages
)
{
size_t
size
=
i
->
count
;
int
npages
=
0
;
if
(
!
size
)
return
0
;
if
(
unlikely
(
i
->
type
&
ITER_PIPE
))
{
struct
pipe_inode_info
*
pipe
=
i
->
pipe
;
size_t
off
;
int
idx
;
if
(
!
sanity
(
i
))
return
0
;
data_start
(
i
,
&
idx
,
&
off
);
/* some of this one + all after this one */
npages
=
((
pipe
->
curbuf
-
idx
-
1
)
&
(
pipe
->
buffers
-
1
))
+
1
;
if
(
npages
>=
maxpages
)
return
maxpages
;
}
else
iterate_all_kinds
(
i
,
size
,
v
,
({
unsigned
long
p
=
(
unsigned
long
)
v
.
iov_base
;
npages
+=
DIV_ROUND_UP
(
p
+
v
.
iov_len
,
PAGE_SIZE
)
-
p
/
PAGE_SIZE
;
if
(
npages
>=
maxpages
)
return
maxpages
;
0
;}),({
npages
++
;
if
(
npages
>=
maxpages
)
return
maxpages
;
}),({
unsigned
long
p
=
(
unsigned
long
)
v
.
iov_base
;
npages
+=
DIV_ROUND_UP
(
p
+
v
.
iov_len
,
PAGE_SIZE
)
-
p
/
PAGE_SIZE
;
if
(
npages
>=
maxpages
)
return
maxpages
;
})
)
return
npages
;
}
EXPORT_SYMBOL
(
iov_iter_npages
);
const
void
*
dup_iter
(
struct
iov_iter
*
new
,
struct
iov_iter
*
old
,
gfp_t
flags
)
{
*
new
=
*
old
;
if
(
unlikely
(
new
->
type
&
ITER_PIPE
))
{
WARN_ON
(
1
);
return
NULL
;
}
if
(
new
->
type
&
ITER_BVEC
)
return
new
->
bvec
=
kmemdup
(
new
->
bvec
,
new
->
nr_segs
*
sizeof
(
struct
bio_vec
),
flags
);
else
/* iovec and kvec have identical layout */
return
new
->
iov
=
kmemdup
(
new
->
iov
,
new
->
nr_segs
*
sizeof
(
struct
iovec
),
flags
);
}
EXPORT_SYMBOL
(
dup_iter
);
/**
* import_iovec() - Copy an array of &struct iovec from userspace
* into the kernel, check that it is valid, and initialize a new
* &struct iov_iter iterator to access it.
*
* @type: One of %READ or %WRITE.
* @uvector: Pointer to the userspace array.
* @nr_segs: Number of elements in userspace array.
* @fast_segs: Number of elements in @iov.
* @iov: (input and output parameter) Pointer to pointer to (usually small
* on-stack) kernel array.
* @i: Pointer to iterator that will be initialized on success.
*
* If the array pointed to by *@iov is large enough to hold all @nr_segs,
* then this function places %NULL in *@iov on return. Otherwise, a new
* array will be allocated and the result placed in *@iov. This means that
* the caller may call kfree() on *@iov regardless of whether the small
* on-stack array was used or not (and regardless of whether this function
* returns an error or not).
*
* Return: 0 on success or negative error code on error.
*/
int
import_iovec
(
int
type
,
const
struct
iovec
__user
*
uvector
,
unsigned
nr_segs
,
unsigned
fast_segs
,
struct
iovec
**
iov
,
struct
iov_iter
*
i
)
{
ssize_t
n
;
struct
iovec
*
p
;
n
=
rw_copy_check_uvector
(
type
,
uvector
,
nr_segs
,
fast_segs
,
*
iov
,
&
p
);
if
(
n
<
0
)
{
if
(
p
!=
*
iov
)
kfree
(
p
);
*
iov
=
NULL
;
return
n
;
}
iov_iter_init
(
i
,
type
,
p
,
nr_segs
,
n
);
*
iov
=
p
==
*
iov
?
NULL
:
p
;
return
0
;
}
EXPORT_SYMBOL
(
import_iovec
);
#ifdef CONFIG_COMPAT
#include <linux/compat.h>
int
compat_import_iovec
(
int
type
,
const
struct
compat_iovec
__user
*
uvector
,
unsigned
nr_segs
,
unsigned
fast_segs
,
struct
iovec
**
iov
,
struct
iov_iter
*
i
)
{
ssize_t
n
;
struct
iovec
*
p
;
n
=
compat_rw_copy_check_uvector
(
type
,
uvector
,
nr_segs
,
fast_segs
,
*
iov
,
&
p
);
if
(
n
<
0
)
{
if
(
p
!=
*
iov
)
kfree
(
p
);
*
iov
=
NULL
;
return
n
;
}
iov_iter_init
(
i
,
type
,
p
,
nr_segs
,
n
);
*
iov
=
p
==
*
iov
?
NULL
:
p
;
return
0
;
}
#endif
int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(!rw, buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);
int
iov_iter_for_each_range
(
struct
iov_iter
*
i
,
size_t
bytes
,
int
(
*
f
)(
struct
kvec
*
vec
,
void
*
context
),
void
*
context
)
{
struct
kvec
w
;
int
err
=
-
EINVAL
;
if
(
!
bytes
)
return
0
;
iterate_all_kinds
(
i
,
bytes
,
v
,
-
EINVAL
,
({
w
.
iov_base
=
kmap
(
v
.
bv_page
)
+
v
.
bv_offset
;
w
.
iov_len
=
v
.
bv_len
;
err
=
f
(
&
w
,
context
);
kunmap
(
v
.
bv_page
);
err
;}),
({
w
=
v
;
err
=
f
(
&
w
,
context
);})
)
return
err
;
}
EXPORT_SYMBOL
(
iov_iter_for_each_range
);
src/mm/filemap.c
View file @ f1301897
...
@@ -437,13 +437,13 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.range_start = start,
 		.range_end = end,
 	};
+	pr_debug("mapping = %p, start = %lld, end = %lld, sync_mode=%d", mapping, start, end, sync_mode);
 	if (!mapping_cap_writeback_dirty(mapping))
 		return 0;
 
 	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
 	ret = do_writepages(mapping, &wbc);
 	wbc_detach_inode(&wbc);
+	pr_debug("ret=%d", ret);
 	return ret;
 }
...
@@ -651,14 +651,16 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 				 loff_t lstart, loff_t lend)
 {
 	int err = 0;
+	pr_debug("mapping = %p, lstart = %lld, lend = %lld, needs write_back=%d", mapping, lstart, lend, mapping_needs_writeback(mapping));
 	if (mapping_needs_writeback(mapping)) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);
+		pr_debug("err=%d", err);
 		/* See comment of filemap_write_and_wait() */
 		if (err != -EIO) {
 			int err2 = filemap_fdatawait_range(mapping,
 						lstart, lend);
+			pr_debug("err2=%d", err2);
 			if (!err)
 				err = err2;
 		} else {
...
@@ -667,6 +669,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 		}
 	} else {
 		err = filemap_check_errors(mapping);
+		pr_debug("filemap_check_errors(mapping) -> %d", err);
 	}
 	return err;
 }
...
@@ -3005,8 +3008,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t		written;
 	size_t		write_len;
 	pgoff_t		end;
-	pr_debug("generic_file_direct_write()");
 	write_len = iov_iter_count(from);
+	pr_debug("generic_file_direct_write(), pos=%lld, write_len = %d", pos, write_len);
 	end = (pos + write_len - 1) >> PAGE_SHIFT;
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
...
@@ -3017,10 +3020,12 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	} else {
 		written = filemap_write_and_wait_range(mapping, pos,
 					pos + write_len - 1);
-		if (written)
+		if (written) {
+			pr_debug("written = %d", written);
 			goto out;
+		}
 	}
+	pr_debug("not written, written = %d", written);
 	/*
 	 * After a write we want buffered reads to be sure to go to disk to get
 	 * the new data.  We invalidate clean cached page from the region we're
...
@@ -3038,9 +3043,9 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 			return 0;
 		goto out;
 	}
+	pr_debug("Before mapping->a_ops->direct_IO");
 	written = mapping->a_ops->direct_IO(iocb, from);
+	pr_debug("After mapping->a_ops->direct_IO, written = %d", written);
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
 	 * cached by non-direct readahead, or faulted in by get_user_pages()
...
@@ -3225,7 +3230,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos, endbyte;
-		pr_debug("using IOCB_DIRECT ");
+		pr_debug("using IOCB_DIRECT, count = %d, type = %d", iov_iter_count(from), from->type);
 		written = generic_file_direct_write(iocb, from);
 		/*
 		 * If the write stopped short of completing, fall back to
...
@@ -3234,8 +3239,11 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		 * not succeed (even if it did, DAX does not handle dirty
 		 * page-cache pages correctly).
 		 */
-		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
+		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)){
+			pr_debug("IOCB_DIRECT %s: written = %d, iov_iter_count(from)= %d", (written < 0) ? "ERROR" : "OK", written, iov_iter_count(from));
 			goto out;
+		}
+		pr_debug("IOCB_DIRECT NEED MORE: written = %d, iov_iter_count(from)= %d", written, iov_iter_count(from));
 		status = generic_perform_write(file, from, pos = iocb->ki_pos);
 		/*
...