From 4bdcf62a54681732acb3491fc58aa12e2e746e83 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Wed, 22 Apr 2020 12:39:09 -0700 Subject: [PATCH] ANDROID: Incremental fs: Use simple compression in log buffer Bug: 154342202 Test: incfs_test passes Signed-off-by: Paul Lawrence Change-Id: Ibcc641dd92596018c9f10b5bc7bd0db2642a80c7 (cherry picked from commit b6b4a3a404ccd9c62347e27c4fc7883d776c2cbb) Git-commit: fdd560ec460793a02cf226cc4f2aac6d0e637e5c Git-repo: https://android.googlesource.com/kernel/common/ Signed-off-by: Sayali Lokhande --- fs/incfs/data_mgmt.c | 310 ++++++++++-------- fs/incfs/data_mgmt.h | 76 +++-- fs/incfs/vfs.c | 2 +- .../selftests/filesystems/incfs/incfs_test.c | 89 +++-- 4 files changed, 291 insertions(+), 186 deletions(-) diff --git a/fs/incfs/data_mgmt.c b/fs/incfs/data_mgmt.c index 4ec08eb52461..d9c43d5cca19 100644 --- a/fs/incfs/data_mgmt.c +++ b/fs/incfs/data_mgmt.c @@ -34,8 +34,7 @@ struct mount_info *incfs_alloc_mount_info(struct super_block *sb, mutex_init(&mi->mi_pending_reads_mutex); init_waitqueue_head(&mi->mi_pending_reads_notif_wq); init_waitqueue_head(&mi->mi_log.ml_notif_wq); - rwlock_init(&mi->mi_log.rl_access_lock); - spin_lock_init(&mi->mi_log.rl_logging_lock); + spin_lock_init(&mi->mi_log.rl_lock); INIT_LIST_HEAD(&mi->mi_reads_list_head); error = incfs_realloc_mount_info(mi, options); @@ -53,6 +52,7 @@ int incfs_realloc_mount_info(struct mount_info *mi, struct mount_options *options) { void *new_buffer = NULL; + void *old_buffer; size_t new_buffer_size = 0; if (options->read_log_pages != mi->mi_options.read_log_pages) { @@ -71,16 +71,18 @@ int incfs_realloc_mount_info(struct mount_info *mi, return -ENOMEM; } - write_lock(&mi->mi_log.rl_access_lock); - kfree(mi->mi_log.rl_ring_buf); - WRITE_ONCE(mi->mi_log.rl_ring_buf, new_buffer); - WRITE_ONCE(mi->mi_log.rl_size, - new_buffer_size / sizeof(*mi->mi_log.rl_ring_buf)); - log_state = READ_ONCE(mi->mi_log.rl_state); - log_state.generation_id++; - log_state.next_index = log_state.current_pass_no = 0; - WRITE_ONCE(mi->mi_log.rl_state, log_state); - write_unlock(&mi->mi_log.rl_access_lock); + spin_lock(&mi->mi_log.rl_lock); + old_buffer = mi->mi_log.rl_ring_buf; + mi->mi_log.rl_ring_buf = new_buffer; + mi->mi_log.rl_size = new_buffer_size; + log_state = (struct read_log_state){ + .generation_id = mi->mi_log.rl_head.generation_id + 1, + }; + mi->mi_log.rl_head = log_state; + mi->mi_log.rl_tail = log_state; + spin_unlock(&mi->mi_log.rl_lock); + + kfree(old_buffer); } mi->mi_options = *options; @@ -246,37 +248,121 @@ static ssize_t decompress(struct mem_range src, struct mem_range dst) return result; } +static void log_read_one_record(struct read_log *rl, struct read_log_state *rs) +{ + union log_record *record = + (union log_record *)((u8 *)rl->rl_ring_buf + rs->next_offset); + size_t record_size; + + switch (record->full_record.type) { + case FULL: + rs->base_record = record->full_record; + record_size = sizeof(record->full_record); + break; + + case SAME_FILE: + rs->base_record.block_index = + record->same_file_record.block_index; + rs->base_record.absolute_ts_us += + record->same_file_record.relative_ts_us; + record_size = sizeof(record->same_file_record); + break; + + case SAME_FILE_NEXT_BLOCK: + ++rs->base_record.block_index; + rs->base_record.absolute_ts_us += + record->same_file_next_block.relative_ts_us; + record_size = sizeof(record->same_file_next_block); + break; + + case SAME_FILE_NEXT_BLOCK_SHORT: + ++rs->base_record.block_index; + rs->base_record.absolute_ts_us += + record->same_file_next_block_short.relative_ts_us; + record_size = sizeof(record->same_file_next_block_short); + break; + } + + rs->next_offset += record_size; + if (rs->next_offset > rl->rl_size - sizeof(*record)) { + rs->next_offset = 0; + ++rs->current_pass_no; + } + ++rs->current_record_no; +} + static void log_block_read(struct mount_info *mi, incfs_uuid_t *id, - int block_index, bool timed_out) + int block_index) { struct read_log *log = &mi->mi_log; - struct read_log_state state; + struct read_log_state *head, *tail; s64 now_us = ktime_to_us(ktime_get()); - int rl_size; - struct read_log_record record = { - .file_id = *id, - .block_index = block_index, - .timed_out = timed_out, - .timestamp_us = now_us - }; - - read_lock(&log->rl_access_lock); - rl_size = READ_ONCE(log->rl_size); - if (rl_size != 0) { - spin_lock(&log->rl_logging_lock); - state = READ_ONCE(log->rl_state); - log->rl_ring_buf[state.next_index] = record; - if (++state.next_index == rl_size) { - state.next_index = 0; - ++state.current_pass_no; - } - WRITE_ONCE(log->rl_state, state); - spin_unlock(&log->rl_logging_lock); + s64 relative_us; + union log_record record; + size_t record_size; + + spin_lock(&log->rl_lock); + if (log->rl_size == 0) { + spin_unlock(&log->rl_lock); + return; } - read_unlock(&log->rl_access_lock); - if (rl_size != 0) - wake_up_all(&log->ml_notif_wq); + head = &log->rl_head; + tail = &log->rl_tail; + relative_us = now_us - head->base_record.absolute_ts_us; + + if (memcmp(id, &head->base_record.file_id, sizeof(incfs_uuid_t)) || + relative_us >= 1ll << 32) { + record.full_record = (struct full_record){ + .type = FULL, + .block_index = block_index, + .file_id = *id, + .absolute_ts_us = now_us, + }; + record_size = sizeof(struct full_record); + } else if (block_index != head->base_record.block_index + 1 || + relative_us >= 1 << 30) { + record.same_file_record = (struct same_file_record){ + .type = SAME_FILE, + .block_index = block_index, + .relative_ts_us = relative_us, + }; + record_size = sizeof(struct same_file_record); + } else if (relative_us >= 1 << 14) { + record.same_file_next_block = (struct same_file_next_block){ + .type = SAME_FILE_NEXT_BLOCK, + .relative_ts_us = relative_us, + }; + record_size = sizeof(struct same_file_next_block); + } else { + record.same_file_next_block_short = + (struct same_file_next_block_short){ + .type = SAME_FILE_NEXT_BLOCK_SHORT, + .relative_ts_us = relative_us, + }; + record_size = sizeof(struct same_file_next_block_short); + } + + head->base_record.file_id = *id; + head->base_record.block_index = block_index; + head->base_record.absolute_ts_us = now_us; + + /* Advance tail beyond area we are going to overwrite */ + while (tail->current_pass_no < head->current_pass_no && + tail->next_offset < head->next_offset + record_size) + log_read_one_record(log, tail); + + memcpy(((u8 *)log->rl_ring_buf) + head->next_offset, &record, + record_size); + head->next_offset += record_size; + if (head->next_offset > log->rl_size - sizeof(record)) { + head->next_offset = 0; + ++head->current_pass_no; + } + ++head->current_record_no; + + spin_unlock(&log->rl_lock); + wake_up_all(&log->ml_notif_wq); } static int validate_hash_tree(struct file *bf, struct data_file *df, @@ -708,8 +794,7 @@ static int wait_for_data_block(struct data_file *df, int block_index, mi = df->df_mount_info; if (timeout_ms == 0) { - log_block_read(mi, &df->df_id, block_index, - true /*timed out*/); + log_block_read(mi, &df->df_id, block_index); return -ETIME; } @@ -728,8 +813,7 @@ static int wait_for_data_block(struct data_file *df, int block_index, if (wait_res == 0) { /* Wait has timed out */ - log_block_read(mi, &df->df_id, block_index, - true /*timed out*/); + log_block_read(mi, &df->df_id, block_index); return -ETIME; } if (wait_res < 0) { @@ -825,7 +909,7 @@ ssize_t incfs_read_data_file_block(struct mem_range dst, struct data_file *df, } if (result >= 0) - log_block_read(mi, &df->df_id, index, false /*timed out*/); + log_block_read(mi, &df->df_id, index); out: return result; @@ -1195,49 +1279,29 @@ struct read_log_state incfs_get_log_state(struct mount_info *mi) struct read_log *log = &mi->mi_log; struct read_log_state result; - read_lock(&log->rl_access_lock); - spin_lock(&log->rl_logging_lock); - result = READ_ONCE(log->rl_state); - spin_unlock(&log->rl_logging_lock); - read_unlock(&log->rl_access_lock); + spin_lock(&log->rl_lock); + result = log->rl_head; + spin_unlock(&log->rl_lock); return result; } -static u64 calc_record_count(const struct read_log_state *state, int rl_size) -{ - return state->current_pass_no * (u64)rl_size + state->next_index; -} - int incfs_get_uncollected_logs_count(struct mount_info *mi, - struct read_log_state state) + const struct read_log_state *state) { struct read_log *log = &mi->mi_log; - struct read_log_state rl_state; - int rl_size; - u64 count; - - read_lock(&log->rl_access_lock); - rl_size = READ_ONCE(log->rl_size); - spin_lock(&log->rl_logging_lock); - rl_state = READ_ONCE(log->rl_state); - spin_unlock(&log->rl_logging_lock); - read_unlock(&log->rl_access_lock); - - count = calc_record_count(&rl_state, rl_size); - if (rl_state.generation_id == state.generation_id) - count -= calc_record_count(&state, rl_size); - return min_t(int, count, rl_size); -} - -static void fill_pending_read_from_log_record( - struct incfs_pending_read_info *dest, const struct read_log_record *src, - struct read_log_state *state, u64 log_size) -{ - dest->file_id = src->file_id; - dest->block_index = src->block_index; - dest->serial_number = - state->current_pass_no * log_size + state->next_index; - dest->timestamp_us = src->timestamp_us; + u32 generation; + u64 head_no, tail_no; + + spin_lock(&log->rl_lock); + tail_no = log->rl_tail.current_record_no; + head_no = log->rl_head.current_record_no; + generation = log->rl_head.generation_id; + spin_unlock(&log->rl_lock); + + if (generation != state->generation_id) + return head_no - tail_no; + else + return head_no - max_t(u64, tail_no, state->current_record_no); } int incfs_collect_logged_reads(struct mount_info *mi, @@ -1245,82 +1309,48 @@ int incfs_collect_logged_reads(struct mount_info *mi, struct incfs_pending_read_info *reads, int reads_size) { - struct read_log *log = &mi->mi_log; - struct read_log_state live_state; int dst_idx; - int rl_size; - int result = 0; - u64 read_count; - u64 written_count; - - read_lock(&log->rl_access_lock); - - rl_size = READ_ONCE(log->rl_size); - spin_lock(&log->rl_logging_lock); - live_state = READ_ONCE(log->rl_state); - spin_unlock(&log->rl_logging_lock); + struct read_log *log = &mi->mi_log; + struct read_log_state *head, *tail; - if (reader_state->generation_id != live_state.generation_id) { - reader_state->generation_id = live_state.generation_id; - reader_state->current_pass_no = reader_state->next_index = 0; - } + spin_lock(&log->rl_lock); + head = &log->rl_head; + tail = &log->rl_tail; - read_count = calc_record_count(reader_state, rl_size); - written_count = calc_record_count(&live_state, rl_size); - if (read_count == written_count) { - result = 0; - goto out; - } - if (reader_state->next_index >= rl_size) { - result = -ERANGE; - goto out; - } + if (reader_state->generation_id != head->generation_id) { + pr_debug("read ptr is wrong generation: %u/%u", + reader_state->generation_id, head->generation_id); - if (read_count > written_count) { - /* This reader is somehow ahead of the writer. */ - pr_debug("incfs: Log reader is ahead of writer\n"); - *reader_state = live_state; + *reader_state = (struct read_log_state){ + .generation_id = head->generation_id, + }; } - if (written_count - read_count > rl_size) { - /* - * Reading pointer is too far behind, - * start from the record following the write pointer. - */ - pr_debug( - "incfs: read pointer is behind, moving: %u/%u -> %u/%u / %u\n", - (u32)reader_state->next_index, - (u32)reader_state->current_pass_no, - (u32)live_state.next_index, - (u32)live_state.current_pass_no - 1, (u32)rl_size); + if (reader_state->current_record_no < tail->current_record_no) { + pr_debug("read ptr is behind, moving: %u/%u -> %u/%u\n", + (u32)reader_state->next_offset, + (u32)reader_state->current_pass_no, + (u32)tail->next_offset, (u32)tail->current_pass_no); - *reader_state = (struct read_log_state){ - .next_index = live_state.next_index, - .current_pass_no = live_state.current_pass_no - 1, - }; + *reader_state = *tail; } for (dst_idx = 0; dst_idx < reads_size; dst_idx++) { - if (reader_state->next_index == live_state.next_index && - reader_state->current_pass_no == live_state.current_pass_no) + if (reader_state->current_record_no == head->current_record_no) break; - fill_pending_read_from_log_record( - &reads[dst_idx], - &log->rl_ring_buf[reader_state->next_index], - reader_state, rl_size); + log_read_one_record(log, reader_state); - reader_state->next_index++; - if (reader_state->next_index == rl_size) { - reader_state->next_index = 0; - reader_state->current_pass_no++; - } + reads[dst_idx] = (struct incfs_pending_read_info){ + .file_id = reader_state->base_record.file_id, + .block_index = reader_state->base_record.block_index, + .serial_number = reader_state->current_record_no, + .timestamp_us = reader_state->base_record.absolute_ts_us + }; } - result = dst_idx; -out: - read_unlock(&log->rl_access_lock); - return result; + spin_unlock(&log->rl_lock); + return dst_idx; } bool incfs_equal_ranges(struct mem_range lhs, struct mem_range rhs) diff --git a/fs/incfs/data_mgmt.h b/fs/incfs/data_mgmt.h index b860997d0bb6..b7aecdd5bf4a 100644 --- a/fs/incfs/data_mgmt.h +++ b/fs/incfs/data_mgmt.h @@ -20,50 +20,74 @@ #define SEGMENTS_PER_FILE 3 -struct read_log_record { - u32 block_index : 31; - - u32 timed_out : 1; - - u64 timestamp_us; +enum LOG_RECORD_TYPE { + FULL, + SAME_FILE, + SAME_FILE_NEXT_BLOCK, + SAME_FILE_NEXT_BLOCK_SHORT, +}; +struct full_record { + enum LOG_RECORD_TYPE type : 2; /* FULL */ + u32 block_index : 30; incfs_uuid_t file_id; -} __packed; + u64 absolute_ts_us; +} __packed; /* 28 bytes */ + +struct same_file_record { + enum LOG_RECORD_TYPE type : 2; /* SAME_FILE */ + u32 block_index : 30; + u32 relative_ts_us; /* max 2^32 us ~= 1 hour (1:11:30) */ +} __packed; /* 12 bytes */ + +struct same_file_next_block { + enum LOG_RECORD_TYPE type : 2; /* SAME_FILE_NEXT_BLOCK */ + u32 relative_ts_us : 30; /* max 2^30 us ~= 15 min (17:50) */ +} __packed; /* 4 bytes */ + +struct same_file_next_block_short { + enum LOG_RECORD_TYPE type : 2; /* SAME_FILE_NEXT_BLOCK_SHORT */ + u16 relative_ts_us : 14; /* max 2^14 us ~= 16 ms */ +} __packed; /* 2 bytes */ + +union log_record { + struct full_record full_record; + struct same_file_record same_file_record; + struct same_file_next_block same_file_next_block; + struct same_file_next_block_short same_file_next_block_short; +}; struct read_log_state { /* Log buffer generation id, incremented on configuration changes */ - u32 generation_id : 8; + u32 generation_id; - /* Next slot in rl_ring_buf to write into. */ - u32 next_index : 24; + /* Offset in rl_ring_buf to write into. */ + u32 next_offset; /* Current number of writer passes over rl_ring_buf */ u32 current_pass_no; + + /* Current full_record to diff against */ + struct full_record base_record; + + /* Current record number counting from configuration change */ + u64 current_record_no; }; /* A ring buffer to save records about data blocks which were recently read. */ struct read_log { - struct read_log_record *rl_ring_buf; + void *rl_ring_buf; int rl_size; - struct read_log_state rl_state; + struct read_log_state rl_head; - /* - * A lock for _all_ accesses to the struct, to protect against remounts. - * Taken for writing when resizing the buffer. - */ - rwlock_t rl_access_lock; + struct read_log_state rl_tail; - /* - * A lock to protect the actual logging - adding a new record. - * Note: ALWAYS taken after and under the |rl_access_lock|. - */ - spinlock_t rl_logging_lock; + /* A lock to protect the above fields */ + spinlock_t rl_lock; - /* - * A queue of waiters who want to be notified about reads. - */ + /* A queue of waiters who want to be notified about reads */ wait_queue_head_t ml_notif_wq; }; @@ -281,7 +305,7 @@ int incfs_collect_logged_reads(struct mount_info *mi, int reads_size); struct read_log_state incfs_get_log_state(struct mount_info *mi); int incfs_get_uncollected_logs_count(struct mount_info *mi, - struct read_log_state state); + const struct read_log_state *state); static inline struct inode_info *get_incfs_node(struct inode *inode) { diff --git a/fs/incfs/vfs.c b/fs/incfs/vfs.c index 0a13821f5b59..ec028fc96303 100644 --- a/fs/incfs/vfs.c +++ b/fs/incfs/vfs.c @@ -643,7 +643,7 @@ static __poll_t log_poll(struct file *file, poll_table *wait) __poll_t ret = 0; poll_wait(file, &mi->mi_log.ml_notif_wq, wait); - count = incfs_get_uncollected_logs_count(mi, log_state->state); + count = incfs_get_uncollected_logs_count(mi, &log_state->state); if (count >= mi->mi_options.read_log_wakeup_count) ret = EPOLLIN | EPOLLRDNORM; diff --git a/tools/testing/selftests/filesystems/incfs/incfs_test.c b/tools/testing/selftests/filesystems/incfs/incfs_test.c index f9661a9eb3fa..41a23b605e13 100644 --- a/tools/testing/selftests/filesystems/incfs/incfs_test.c +++ b/tools/testing/selftests/filesystems/incfs/incfs_test.c @@ -1930,16 +1930,19 @@ failure: return TEST_FAILURE; } +enum expected_log { FULL_LOG, NO_LOG, PARTIAL_LOG }; + static int validate_logs(char *mount_dir, int log_fd, struct test_file *file, - bool no_rlog) + enum expected_log expected_log) { uint8_t data[INCFS_DATA_FILE_BLOCK_SIZE]; - struct incfs_pending_read_info prs[100] = {}; + struct incfs_pending_read_info prs[2048] = {}; int prs_size = ARRAY_SIZE(prs); int block_cnt = 1 + (file->size - 1) / INCFS_DATA_FILE_BLOCK_SIZE; + int expected_read_block_cnt; int res; int read_count; - int i; + int i, j; char *filename = concat_file_name(mount_dir, file->name); int fd; @@ -1950,17 +1953,29 @@ static int validate_logs(char *mount_dir, int log_fd, struct test_file *file, if (block_cnt > prs_size) block_cnt = prs_size; + expected_read_block_cnt = block_cnt; for (i = 0; i < block_cnt; i++) { res = pread(fd, data, sizeof(data), INCFS_DATA_FILE_BLOCK_SIZE * i); + + /* Make some read logs of type SAME_FILE_NEXT_BLOCK */ + if (i % 10 == 0) + usleep(20000); + + /* Skip some blocks to make logs of type SAME_FILE */ + if (i % 10 == 5) { + ++i; + --expected_read_block_cnt; + } + if (res <= 0) goto failure; } - read_count = - wait_for_pending_reads(log_fd, no_rlog ? 10 : 0, prs, prs_size); - if (no_rlog) { + read_count = wait_for_pending_reads( + log_fd, expected_log == NO_LOG ? 10 : 0, prs, prs_size); + if (expected_log == NO_LOG) { if (read_count == 0) goto success; if (read_count < 0) @@ -1977,14 +1992,26 @@ static int validate_logs(char *mount_dir, int log_fd, struct test_file *file, goto failure; } - if (read_count != block_cnt) { + i = 0; + if (expected_log == PARTIAL_LOG) { + if (read_count == 0) { + ksft_print_msg("No logs %s.\n", file->name); + goto failure; + } + + for (i = 0, j = 0; j < expected_read_block_cnt - read_count; + i++, j++) + if (i % 10 == 5) + ++i; + + } else if (read_count != expected_read_block_cnt) { ksft_print_msg("Bad log read count %s %d %d.\n", file->name, - read_count, block_cnt); + read_count, expected_read_block_cnt); goto failure; } - for (i = 0; i < read_count; i++) { - struct incfs_pending_read_info *read = &prs[i]; + for (j = 0; j < read_count; i++, j++) { + struct incfs_pending_read_info *read = &prs[j]; if (!same_id(&read->file_id, &file->id)) { ksft_print_msg("Bad log read ino %s\n", file->name); @@ -1997,8 +2024,8 @@ static int validate_logs(char *mount_dir, int log_fd, struct test_file *file, goto failure; } - if (i != 0) { - unsigned long psn = prs[i - 1].serial_number; + if (j != 0) { + unsigned long psn = prs[j - 1].serial_number; if (read->serial_number != psn + 1) { ksft_print_msg("Bad log read sn %s %d %d.\n", @@ -2013,6 +2040,9 @@ static int validate_logs(char *mount_dir, int log_fd, struct test_file *file, file->name); goto failure; } + + if (i % 10 == 5) + ++i; } success: @@ -2063,7 +2093,7 @@ static int read_log_test(char *mount_dir) for (i = 0; i < file_num; i++) { struct test_file *file = &test.files[i]; - if (validate_logs(mount_dir, log_fd, file, false)) + if (validate_logs(mount_dir, log_fd, file, FULL_LOG)) goto failure; } @@ -2091,7 +2121,7 @@ static int read_log_test(char *mount_dir) for (i = 0; i < file_num; i++) { struct test_file *file = &test.files[i]; - if (validate_logs(mount_dir, log_fd, file, false)) + if (validate_logs(mount_dir, log_fd, file, FULL_LOG)) goto failure; } @@ -2118,7 +2148,30 @@ static int read_log_test(char *mount_dir) for (i = 0; i < file_num; i++) { struct test_file *file = &test.files[i]; - if (validate_logs(mount_dir, log_fd, file, true)) + if (validate_logs(mount_dir, log_fd, file, NO_LOG)) + goto failure; + } + + /* + * Remount and check that logs start working again + */ + drop_caches = open("/proc/sys/vm/drop_caches", O_WRONLY | O_CLOEXEC); + if (drop_caches == -1) + goto failure; + i = write(drop_caches, "3", 1); + close(drop_caches); + if (i != 1) + goto failure; + + if (mount_fs_opt(mount_dir, backing_dir, "readahead=0,rlog_pages=1", + true) != 0) + goto failure; + + /* Validate data again */ + for (i = 0; i < file_num; i++) { + struct test_file *file = &test.files[i]; + + if (validate_logs(mount_dir, log_fd, file, PARTIAL_LOG)) goto failure; } @@ -2141,7 +2194,7 @@ static int read_log_test(char *mount_dir) for (i = 0; i < file_num; i++) { struct test_file *file = &test.files[i]; - if (validate_logs(mount_dir, log_fd, file, false)) + if (validate_logs(mount_dir, log_fd, file, FULL_LOG)) goto failure; } @@ -2292,8 +2345,6 @@ static int validate_ranges(const char *mount_dir, struct test_file *file) if (fba.start_index >= block_cnt) { if (fba.index_out != fba.start_index) { - printf("Paul: %d, %d\n", (int)fba.index_out, - (int)fba.start_index); error = -EINVAL; goto out; } @@ -2645,7 +2696,7 @@ int main(int argc, char *argv[]) rmdir(mount_dir); if (fails > 0) - ksft_exit_pass(); + ksft_exit_fail(); else ksft_exit_pass(); return 0;