📄 recovery-small.sh
字号:
#!/bin/bash
#
# recovery-small: a collection of small recovery tests.  Each test_N function
# below exercises one failure scenario (dropped request/reply, paused bulk,
# eviction, injected fail_loc, ...) and is registered with run_test.
#
# All helpers used here (do_facet, drop_request, run_test, error, skip, ...)
# come from the sourced test-framework.sh / config file, not from this file.

set -e

# bug 5493
ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT"

#PTLDEBUG=${PTLDEBUG:--1}
LUSTRE=${LUSTRE:-$(dirname $0)/..}
. $LUSTRE/tests/test-framework.sh
init_test_env "$@"
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}

# also long tests: 19, 21a, 21e, 21f, 23, 27
# NOTE(review): the next comment presumably lists per-test run times in
# minutes for the EXCEPT_SLOW entries; the column alignment was lost.
# 1 2.5 2.5 4 4 (min)"
[ "$SLOW" = "no" ] && EXCEPT_SLOW="17 26a 26b 50 51 57"

build_test_filter

# Allow us to override the setup if we already have a mounted system by
# setting SETUP=" " and CLEANUP=" "
SETUP=${SETUP:-""}
CLEANUP=${CLEANUP:-""}

cleanup_and_setup_lustre
# ${DIR:?} aborts instead of expanding to "/[df][0-9]*" if DIR is empty/unset.
rm -rf ${DIR:?}/[df][0-9]*

test_1() {
    drop_request "mcreate $MOUNT/1" || return 1
    drop_reint_reply "mcreate $MOUNT/2" || return 2
}
run_test 1 "mcreate: drop req, drop rep"

test_2() {
    drop_request "tchmod 111 $MOUNT/2" || return 1
    drop_reint_reply "tchmod 666 $MOUNT/2" || return 2
}
run_test 2 "chmod: drop req, drop rep"

test_3() {
    drop_request "statone $MOUNT/2" || return 1
    drop_reply "statone $MOUNT/2" || return 2
}
run_test 3 "stat: drop req, drop rep"

SAMPLE_NAME=recovery-small.junk
SAMPLE_FILE=$TMP/$SAMPLE_NAME
# make this big, else test 9 doesn't wait for bulk -- bz 5595
dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4

test_4() {
    do_facet client "cp $SAMPLE_FILE $MOUNT/$SAMPLE_NAME" || return 1
    drop_request "cat $MOUNT/$SAMPLE_NAME > /dev/null" || return 2
    drop_reply "cat $MOUNT/$SAMPLE_NAME > /dev/null" || return 3
}
run_test 4 "open: drop req, drop rep"

test_5() {
    drop_request "mv $MOUNT/$SAMPLE_NAME $MOUNT/renamed" || return 1
    drop_reint_reply "mv $MOUNT/renamed $MOUNT/renamed-again" || return 2
    do_facet client "checkstat -v $MOUNT/renamed-again" || return 3
}
run_test 5 "rename: drop req, drop rep"

# Recreate the file test 6 links to if test 5 did not leave it behind.
[ ! -e $MOUNT/renamed-again ] && cp $SAMPLE_FILE $MOUNT/renamed-again

test_6() {
    drop_request "mlink $MOUNT/renamed-again $MOUNT/link1" || return 1
    drop_reint_reply "mlink $MOUNT/renamed-again $MOUNT/link2" || return 2
}
run_test 6 "link: drop req, drop rep"

# Recreate the links test 7 removes if test 6 did not leave them behind.
[ ! -e $MOUNT/link1 ] && mlink $MOUNT/renamed-again $MOUNT/link1
[ ! -e $MOUNT/link2 ] && mlink $MOUNT/renamed-again $MOUNT/link2

test_7() {
    drop_request "munlink $MOUNT/link1" || return 1
    drop_reint_reply "munlink $MOUNT/link2" || return 2
}
run_test 7 "unlink: drop req, drop rep"

#bug 1423
test_8() {
    drop_reint_reply "touch $MOUNT/$tfile" || return 1
}
run_test 8 "touch: drop rep (bug 1423)"

# NOTE(review): this is the same path as $TMP/$SAMPLE_NAME above, so the
# sample file is simply regenerated with fresh random contents here.
SAMPLE_FILE=$TMP/recovery-small.junk
dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4

#bug 1420
test_9() {
    pause_bulk "cp /etc/profile $MOUNT/$tfile" || return 1
    do_facet client "cp $SAMPLE_FILE $MOUNT/${tfile}.2" || return 2
    do_facet client "sync"
    do_facet client "rm $MOUNT/$tfile $MOUNT/${tfile}.2" || return 3
}
run_test 9 "pause bulk on OST (bug 1420)"

#bug 1521
test_10() {
    do_facet client mcreate $MOUNT/$tfile || return 1
    drop_bl_callback "chmod 0777 $MOUNT/$tfile" || echo "evicted as expected"
    # wait for the mds to evict the client
    #echo "sleep $(($TIMEOUT*2))"
    #sleep $(($TIMEOUT*2))
    do_facet client touch $MOUNT/$tfile || echo "touch failed, evicted"
    do_facet client checkstat -v -p 0777 $MOUNT/$tfile || return 3
    do_facet client "munlink $MOUNT/$tfile"
}
run_test 10 "finish request on server after client eviction (bug 1521)"

#bug 2460
# wake up a thread waiting for completion after eviction
test_11(){
    do_facet client multiop $MOUNT/$tfile Ow || return 1
    do_facet client multiop $MOUNT/$tfile or || return 2

    cancel_lru_locks osc

    do_facet client multiop $MOUNT/$tfile or || return 3
    drop_bl_callback multiop $MOUNT/$tfile Ow || echo "evicted as expected"

    do_facet client munlink $MOUNT/$tfile || return 4
}
run_test 11 "wake up a thread waiting for completion after eviction (b=2460)"

#b=2494
test_12(){
    $LCTL mark multiop $MOUNT/$tfile OS_c
    do_facet mds "sysctl -w lustre.fail_loc=0x115"
    clear_failloc mds $((TIMEOUT * 2)) &
    multiop_bg_pause $MOUNT/$tfile OS_c || return 1
    # NOTE(review): relies on multiop_bg_pause leaving $! set to the
    # backgrounded multiop -- confirm against test-framework.sh.
    PID=$!
#define OBD_FAIL_MDS_CLOSE_NET 0x115
    kill -USR1 $PID
    echo "waiting for multiop $PID"
    wait $PID || return 2
    do_facet client munlink $MOUNT/$tfile || return 3
}
run_test 12 "recover from timed out resend in ptlrpcd (b=2494)"

# Bug 1138, check that readdir lost recv timeout works.
test_13() {
    mkdir $MOUNT/readdir || return 1
    # Explicit code 2 (was a bare "return") to keep the 1..4 sequence.
    touch $MOUNT/readdir/newentry || return 2
# OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE
    do_facet mds "sysctl -w lustre.fail_loc=0x80000104"
    ls $MOUNT/readdir || return 3
    do_facet mds "sysctl -w lustre.fail_loc=0"
    rm -rf $MOUNT/readdir || return 4
}
run_test 13 "mdc_readpage restart test (bug 1138)"

# Bug 1138, check that readdir lost send timeout works.
test_14() {
    mkdir $MOUNT/readdir
    touch $MOUNT/readdir/newentry
# OBD_FAIL_MDS_SENDPAGE|OBD_FAIL_ONCE
    do_facet mds "sysctl -w lustre.fail_loc=0x80000106"
    ls $MOUNT/readdir || return 1
    do_facet mds "sysctl -w lustre.fail_loc=0"
}
run_test 14 "mdc_readpage resend test (bug 1138)"

test_15() {
    do_facet mds "sysctl -w lustre.fail_loc=0x80000128"
    # The touch is expected to fail; success is the error case.
    touch $DIR/$tfile && return 1
    return 0
}
run_test 15 "failed open (-ENOMEM)"

# Remember the current read-ahead setting so it can be restored later.
READ_AHEAD=$(lctl get_param -n llite.*.max_read_ahead_mb | head -n 1)

# Disable client read-ahead.
stop_read_ahead() {
    lctl set_param -n llite.*.max_read_ahead_mb 0
}

# Restore client read-ahead to the value saved in READ_AHEAD.
start_read_ahead() {
    lctl set_param -n llite.*.max_read_ahead_mb $READ_AHEAD
}

test_16() {
    local rc=0
    local ra_rc=0

    do_facet client cp $SAMPLE_FILE $MOUNT
    sync
    stop_read_ahead

#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE
    do_facet ost1 sysctl -w lustre.fail_loc=0x80000504
    cancel_lru_locks osc
    # OST bulk will time out here, client resends
    do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || rc=1
    do_facet ost1 sysctl -w lustre.fail_loc=0
    if [ $rc -eq 0 ]; then
        # give recovery a chance to finish (shouldn't take long)
        sleep $TIMEOUT
        do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || rc=2
    fi

    # Always restore read-ahead (and fail_loc above), even when a cmp
    # failed, so a failure here does not leak settings into later tests.
    start_read_ahead || ra_rc=$?
    [ $rc -ne 0 ] && return $rc
    return $ra_rc
}
run_test 16 "timeout bulk put, don't evict client (2732)"

test_17() {
    local at_max_saved=0
    local rc=0

    # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
    if at_is_valid && at_is_enabled; then
        at_max_saved=$(at_max_get ost1)
        at_max_set $TIMEOUT ost1
    fi

    # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
    # OST bulk will time out here, client retries
    do_facet ost1 sysctl -w lustre.fail_loc=0x80000503
    # need to ensure we send an RPC
    do_facet client cp $SAMPLE_FILE $DIR/$tfile
    sync

    # with AT, client will wait adaptive_max*factor+net_latency before
    # expiring the req, hopefully timeout*2 is enough
    sleep $(($TIMEOUT*2))

    do_facet ost1 sysctl -w lustre.fail_loc=0
    do_facet client "df $DIR"
    # expect cmp to succeed, client resent bulk
    if ! do_facet client "cmp $SAMPLE_FILE $DIR/$tfile"; then
        rc=3
    elif ! do_facet client "rm $DIR/$tfile"; then
        rc=4
    fi

    # Restore the saved at_max on every path.  Call at_max_set directly:
    # wrapping it in $(...) would try to execute its *output* as a command.
    [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved ost1
    return $rc
}
run_test 17 "timeout bulk get, don't evict client (2732)"

test_18a() {
    [ -z "${ost2_svc}" ] && skip "needs 2 osts" && return 0

    do_facet client mkdir -p $MOUNT/$tdir
    f=$MOUNT/$tdir/$tfile

    cancel_lru_locks osc
    pgcache_empty || return 1

    # 1 stripe on ost2
    lfs setstripe $f -s $((128 * 1024)) -i 1 -c 1

    do_facet client cp $SAMPLE_FILE $f
    sync
    local osc2dev=$(lctl get_param -n devices | grep ${ost2_svc}-osc- | awk '{print $1}')
    $LCTL --device $osc2dev deactivate || return 3
    # my understanding is that there should be nothing in the page
    # cache after the client reconnects?
    rc=0
    pgcache_empty || rc=2
    $LCTL --device $osc2dev activate
    rm -f $f
    return $rc
}
run_test 18a "manual ost invalidate clears page cache immediately"

test_18b() {
    do_facet client mkdir -p $MOUNT/$tdir
    f=$MOUNT/$tdir/$tfile
    f2=$MOUNT/$tdir/${tfile}-2

    cancel_lru_locks osc
    pgcache_empty || return 1

    # shouldn't have to set stripe size of count==1
    lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
    lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1

    do_facet client cp $SAMPLE_FILE $f
    sync
    ost_evict_client
    # force reconnect
    df $MOUNT > /dev/null 2>&1
    sleep 2
    # my understanding is that there should be nothing in the page
    # cache after the client reconnects?
    rc=0
    pgcache_empty || rc=2
    rm -f $f $f2
    return $rc
}
run_test 18b "eviction and reconnect clears page cache (2766)"

test_18c() {
    do_facet client mkdir -p $MOUNT/$tdir
    f=$MOUNT/$tdir/$tfile
    f2=$MOUNT/$tdir/${tfile}-2

    cancel_lru_locks osc
    pgcache_empty || return 1

    # shouldn't have to set stripe size of count==1
    lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
    lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1

    do_facet client cp $SAMPLE_FILE $f
    sync
    ost_evict_client

    # OBD_FAIL_OST_CONNECT_NET2
    # lost reply to connect request
    do_facet ost1 sysctl -w lustre.fail_loc=0x80000225
    # force reconnect
    df $MOUNT > /dev/null 2>&1
    sleep 2
    # my understanding is that there should be nothing in the page
    # cache after the client reconnects?
    rc=0
    pgcache_empty || rc=2
    rm -f $f $f2
    return $rc
}
run_test 18c "Dropped connect reply after eviction handing (14755)"

test_19a() {
    f=$MOUNT/$tfile
    do_facet client mcreate $f || return 1
    drop_ldlm_cancel "chmod 0777 $f" || echo "evicted as expected"

    do_facet client checkstat -v -p 0777 $f || echo evicted
    # let the client reconnect
    sleep 5
    do_facet client "munlink $f"
}
run_test 19a "test expired_lock_main on mds (2867)"

test_19b() {
    f=$MOUNT/$tfile
    do_facet client multiop $f Ow || return 1
    do_facet client multiop $f or || return 2

    cancel_lru_locks osc

    do_facet client multiop $f or || return 3
    drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected"

    do_facet client munlink $f || return 4
}
run_test 19b "test expired_lock_main on ost (2867)"

test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup
    mkdir -p $DIR/$tdir
    lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1
    multiop_bg_pause $DIR/$tdir/${tfile} O_wc || return 1
    MULTI_PID=$!
    cancel_lru_locks osc
#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
    do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
    kill -USR1 $MULTI_PID
    # multiop is expected to fail here; capture the status with "||" so
    # the non-zero exit cannot trip "set -e" before rc is recorded.
    rc=0
    wait $MULTI_PID || rc=$?
    [ $rc -eq 0 ] && error "multiop didn't fail enqueue: rc $rc" || true
}
run_test 20a "ldlm_handle_enqueue error (should return error)"

test_20b() { # bug 2986 - ldlm_handle_enqueue error during open
    mkdir -p $DIR/$tdir
    lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1
    cancel_lru_locks osc
#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
    do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
    dd if=/etc/hosts of=$DIR/$tdir/$tfile && \
        error "didn't fail open enqueue" || true
}
run_test 20b "ldlm_handle_enqueue error (should return error)"

test_21a() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
    multiop $DIR/$tdir-2/f Oc &
    open_pid=$!
    sleep 1
    do_facet mds "sysctl -w lustre.fail_loc=0"

    do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
    kill -USR1 $close_pid
    cancel_lru_locks mdc
    wait $close_pid || return 1
    wait $open_pid || return 2
    do_facet mds "sysctl -w lustre.fail_loc=0"

    $CHECKSTAT -t file $DIR/$tdir-1/f || return 3
    $CHECKSTAT -t file $DIR/$tdir-2/f || return 4

    rm -rf $DIR/$tdir-*
}
run_test 21a "drop close request while close and open are both in flight"

test_21b() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
    mcreate $DIR/$tdir-2/f &
    open_pid=$!
    sleep 1
    do_facet mds "sysctl -w lustre.fail_loc=0"

    kill -USR1 $close_pid
    cancel_lru_locks mdc
    wait $close_pid || return 1
    wait $open_pid || return 3

    $CHECKSTAT -t file $DIR/$tdir-1/f || return 4
    $CHECKSTAT -t file $DIR/$tdir-2/f || return 5
    rm -rf $DIR/$tdir-*
}
run_test 21b "drop open request while close and open are both in flight"

test_21c() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
    mcreate $DIR/$tdir-2/f &
    open_pid=$!
    sleep 3
    do_facet mds "sysctl -w lustre.fail_loc=0"

    do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
    kill -USR1 $close_pid
    cancel_lru_locks mdc
    wait $close_pid || return 1
    wait $open_pid || return 2

    do_facet mds "sysctl -w lustre.fail_loc=0"

    $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
    $CHECKSTAT -t file $DIR/$tdir-2/f || return 3
    rm -rf $DIR/$tdir-*
}
run_test 21c "drop both request while close and open are both in flight"

test_21d() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
    pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000129"
    multiop $DIR/$tdir-2/f Oc &
    sleep 1
    do_facet mds "sysctl -w lustre.fail_loc=0"

    do_facet mds "sysctl -w lustre.fail_loc=0x80000122"
    kill -USR1 $pid
    cancel_lru_locks mdc
    wait $pid || return 1
    do_facet mds "sysctl -w lustre.fail_loc=0"

    $CHECKSTAT -t file $DIR/$tdir-1/f || return 2
    $CHECKSTAT -t file $DIR/$tdir-2/f || return 3

    rm -rf $DIR/$tdir-*
}
run_test 21d "drop close reply while close and open are both in flight"

# NOTE(review): the source is cut off in the middle of test_21e -- there is
# no closing brace and no run_test call.  The visible fragment is preserved
# verbatim below, commented out so the rest of the script still parses.
# Restore the full function from the upstream file before relying on it.
#
# test_21e() {
#     mkdir -p $DIR/$tdir-1
#     mkdir -p $DIR/$tdir-2
#     multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
#     pid=$!
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -