📄 replay-single.sh
字号:
# ---------------------------------------------------------------------------
# Tests 53b-53h: one client has a close (multiop) and an open/create (mcreate)
# outstanding against the MDS at the same time while a fail_loc drops either
# the request or the reply of one (or both) of them; the MDS is then failed
# over and both files must exist afterwards.
#
# fail_loc values carry the 0x80000000 bit, which is OBD_FAIL_ONCE (see the
# "OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE" note on test_55).
# ---------------------------------------------------------------------------

# 53b: the open request is dropped; the close completes before the failover.
# Returns 1-5 to identify which step failed.
test_53b() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    multiop $DIR/$tdir-1/f O_c &
    close_pid=$!

    #define OBD_FAIL_MDS_REINT_NET 0x107
    do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1

    do_facet mds "sysctl -w lustre.fail_loc=0"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close
    wait $close_pid || return 1
    # open should still be here
    [ -d /proc/$open_pid ] || return 2

    replay_barrier_nodf mds
    fail mds
    wait $open_pid || return 3

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
    rm -rf $DIR/${tdir}-*
}
run_test 53b "|X| open request while two MDC requests in flight"

# 53c: fail_loc 0x107 is armed for the open, then 0x115 before the close is
# triggered (presumably the close-request drop, going by the test title).
# After failover the open must finish and the closing process must be gone.
test_53c() {
    mkdir -p $DIR/${tdir}-1
    mkdir -p $DIR/${tdir}-2
    multiop $DIR/${tdir}-1/f O_c &
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1

    do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close

    replay_barrier_nodf mds
    fail_nodf mds
    wait $open_pid || return 1
    sleep 2
    # close should be gone
    [ -d /proc/$close_pid ] && return 2
    do_facet mds "sysctl -w lustre.fail_loc=0"

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
    rm -rf $DIR/${tdir}-*
}
run_test 53c "|X| open request and close request while two MDC requests in flight"

# 53d: the close reply is dropped; a synchronous mcreate then succeeds while
# the closing process is still waiting, and the close completes after failover.
test_53d() {
    mkdir -p $DIR/${tdir}-1
    mkdir -p $DIR/${tdir}-2
    multiop $DIR/${tdir}-1/f O_c &
    close_pid=$!
    # give multiop a chance to open
    sleep 1

    #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
    do_facet mds "sysctl -w lustre.fail_loc=0x8000013b"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close
    do_facet mds "sysctl -w lustre.fail_loc=0"
    mcreate $DIR/${tdir}-2/f || return 1

    # close should still be here
    [ -d /proc/$close_pid ] || return 2
    replay_barrier_nodf mds
    fail mds
    wait $close_pid || return 3

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
    rm -rf $DIR/${tdir}-*
}
run_test 53d "|X| close reply while two MDC requests in flight"

# 53e: same flow as 53b, but the open *reply* is dropped instead of the
# request.
test_53e() {
    mkdir -p $DIR/$tdir-1
    mkdir -p $DIR/$tdir-2
    multiop $DIR/$tdir-1/f O_c &
    close_pid=$!

    #define OBD_FAIL_MDS_REINT_NET_REP 0x119
    do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1

    do_facet mds "sysctl -w lustre.fail_loc=0"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close
    wait $close_pid || return 1
    # open should still be here
    [ -d /proc/$open_pid ] || return 2

    replay_barrier_nodf mds
    fail mds
    wait $open_pid || return 3

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
    rm -rf $DIR/${tdir}-*
}
run_test 53e "|X| open reply while two MDC requests in flight"

# 53f: open reply (0x119) and close reply (0x13b) are both dropped before the
# MDS is failed over.
test_53f() {
    mkdir -p $DIR/${tdir}-1
    mkdir -p $DIR/${tdir}-2
    multiop $DIR/${tdir}-1/f O_c &
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1

    do_facet mds "sysctl -w lustre.fail_loc=0x8000013b"
    kill -USR1 $close_pid
    cancel_lru_locks MDC

    replay_barrier_nodf mds
    fail_nodf mds
    wait $open_pid || return 1
    sleep 2
    # close should be gone
    [ -d /proc/$close_pid ] && return 2
    do_facet mds "sysctl -w lustre.fail_loc=0"

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
    rm -rf $DIR/${tdir}-*
}
run_test 53f "|X| open reply and close reply while two MDC requests in flight"

# 53g: open reply (0x119) dropped, then 0x115 armed for the close; fail_loc is
# cleared *before* the failover in this variant.
test_53g() {
    mkdir -p $DIR/${tdir}-1
    mkdir -p $DIR/${tdir}-2
    multiop $DIR/${tdir}-1/f O_c &
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1

    do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close
    do_facet mds "sysctl -w lustre.fail_loc=0"

    replay_barrier_nodf mds
    fail_nodf mds
    wait $open_pid || return 1
    sleep 2
    # close should be gone
    [ -d /proc/$close_pid ] && return 2

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
    rm -rf $DIR/${tdir}-*
}
run_test 53g "|X| drop open reply and close request while close and open are both in flight"

# 53h: open request (0x107) and close reply (0x13b) are dropped; an extra
# one-second pause is taken after forcing the close, before the barrier.
test_53h() {
    mkdir -p $DIR/${tdir}-1
    mkdir -p $DIR/${tdir}-2
    multiop $DIR/${tdir}-1/f O_c &
    close_pid=$!

    do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
    mcreate $DIR/${tdir}-2/f &
    open_pid=$!
    sleep 1

    do_facet mds "sysctl -w lustre.fail_loc=0x8000013b"
    kill -USR1 $close_pid
    cancel_lru_locks MDC # force the close
    sleep 1

    replay_barrier_nodf mds
    fail_nodf mds
    wait $open_pid || return 1
    sleep 2
    # close should be gone
    [ -d /proc/$close_pid ] && return 2
    do_facet mds "sysctl -w lustre.fail_loc=0"

    $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
    $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
    rm -rf $DIR/${tdir}-*
}
run_test 53h "|X| open request and close reply while two MDC requests in flight"

#b3761 ASSERTION(hash != 0) failed
# 55: arm the open/create fail_loc, run a background touch, then clear the
# fail_loc and remove the file.  Always returns 0: the test only checks that
# nothing asserts.
test_55() {
    # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
    do_facet mds "sysctl -w lustre.fail_loc=0x8000012b"
    touch $DIR/$tfile &
    # give touch a chance to run
    sleep 5
    do_facet mds "sysctl -w lustre.fail_loc=0x0"
    rm $DIR/$tfile
    return 0
}
run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"

#b3440 ASSERTION(rec->ur_fid2->id) failed
# 56: create a dangling symlink, then fail the MDS after a replay barrier.
test_56() {
    ln -s foo $DIR/$tfile
    replay_barrier mds
    #drop_reply "cat $DIR/$tfile"
    fail mds
    sleep 10
}
run_test 56 "don't replay a symlink open request (3440)"

#recovery one mds-ost setattr from llog
# 57: create one file with the MDS->OST setattr fail_loc armed, fail the MDS
# and verify the file is still a regular file.  Returns 1 if it is not.
test_57() {
    #define OBD_FAIL_MDS_OST_SETATTR 0x12c
    do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
    touch $DIR/$tfile
    replay_barrier mds
    fail mds
    sleep 1
    $CHECKSTAT -t file $DIR/$tfile || return 1
    do_facet mds "sysctl -w lustre.fail_loc=0x0"
    rm $DIR/$tfile
}
run_test 57 "test recovery from llog for setattr op"

#recovery many mds-ost setattr from llog
# 58: same as 57 but with 2500 files in $DIR/$tdir.
test_58() {
    # Make the test self-contained; a no-op if the directory already exists.
    mkdir -p $DIR/$tdir
    #define OBD_FAIL_MDS_OST_SETATTR 0x12c
    do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
    createmany -o $DIR/$tdir/$tfile-%d 2500
    replay_barrier mds
    fail mds
    sleep 2
    $CHECKSTAT -t file $DIR/$tdir/$tfile-* >/dev/null || return 1
    do_facet mds "sysctl -w lustre.fail_loc=0x0"
    unlinkmany $DIR/$tdir/$tfile-%d 2500
    rmdir $DIR/$tdir
}
run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"

# log_commit_thread vs filter_destroy race used to lead to import use after free
# bug 11658
# 59: create and unlink 200 files, then fail ost1 and the MDS with the
# recovery-delay fail_loc set on ost1.
test_59() {
    # Make the test self-contained; a no-op if the directory already exists.
    mkdir -p $DIR/$tdir
    createmany -o $DIR/$tdir/$tfile-%d 200
    sync
    unlinkmany $DIR/$tdir/$tfile-%d 200
    #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
    do_facet ost1 "sysctl -w lustre.fail_loc=0x507"
    fail ost1
    fail mds
    do_facet ost1 "sysctl -w lustre.fail_loc=0x0"
    sleep 20
    rmdir $DIR/$tdir
}
run_test 59 "test log_commit_thread vs filter_destroy race"

# race between add unlink llog vs cat log init in post_recovery (only for b1_6)
# bug 12086: should no oops and No ctxt error for this test
# 60: unlink half of the files before and half after an MDS failover, then
# fail if dmesg contains "No ctxt".
test_60() {
    # Make the test self-contained; a no-op if the directory already exists.
    mkdir -p $DIR/$tdir
    createmany -o $DIR/$tdir/$tfile-%d 200
    replay_barrier mds
    unlinkmany $DIR/$tdir/$tfile-%d 0 100
    fail mds
    unlinkmany $DIR/$tdir/$tfile-%d 100 100
    local no_ctxt
    no_ctxt=$(dmesg | grep "No ctxt")
    [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
}
run_test 60 "test llog post recovery init vs llog unlink"

#test race llog recovery thread vs llog cleanup
# 61a: unlink 800 files after a barrier on ost1, then fail ost1 twice with the
# llog-recovery-timeout fail_loc armed.  Returns 1 if any file still exists.
test_61a() {
    # Make the test self-contained; a no-op if the directory already exists.
    mkdir -p $DIR/$tdir
    createmany -o $DIR/$tdir/$tfile-%d 800
    replay_barrier ost1
    # OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
    unlinkmany $DIR/$tdir/$tfile-%d 800
    # Arm the fail_loc on ost1, the facet that is failed over below.
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000221"
    facet_failover ost1
    sleep 10
    fail ost1
    sleep 30
    do_facet ost1 "sysctl -w lustre.fail_loc=0x0"
    $CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
    rmdir $DIR/$tdir
}
run_test 61a "test race llog recovery vs llog cleanup"

#test race mds llog sync vs llog cleanup
# 61b: fail the MDS twice with the llog-sync-timeout fail_loc armed, then check
# that a client write still works.  Returns 1 if the dd fails.
test_61b() {
    # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
    do_facet mds "sysctl -w lustre.fail_loc=0x8000013a"
    facet_failover mds
    sleep 10
    fail mds
    do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 || return 1
}
run_test 61b "test race mds llog sync vs llog cleanup"

#test race cancel cookie cb vs llog cleanup
# 61c: remove a file with the cancel-cookie-timeout fail_loc armed on ost1,
# then fail ost1.
# NOTE(review): the run_test description below duplicates 61b's; it looks like
# a copy/paste leftover but is kept as-is because it is a runtime string.
test_61c() {
    # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
    touch $DIR/$tfile
    # Arm the fail_loc on ost1, the facet that is failed below.
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000222"
    rm $DIR/$tfile
    sleep 10
    fail ost1
}
run_test 61c "test race mds llog sync vs llog cleanup"

#Adaptive Timeouts (bug 3055)
# Set to 1 by at_start when it had to force-enable AT, so that the cleanup
# after the AT tests knows to disable it again.
AT_MAX_SET=0

# Prepare the adaptive-timeout (AT) environment for a test.
# Globals:  AT_MAX_SET (written), ATOLDBASE (read/written: saved at_history)
# Returns:  1 when AT cannot be used (invalid env or no at_history file);
#           otherwise the status of the last command run.  Callers use
#           "at_start || return 0" to skip the test.
at_start()
{
    # Bail out instead of falling through and reconfiguring an invalid setup.
    if ! at_is_valid; then
        skip "AT env is invalid"
        return 1
    fi

    if ! at_is_enabled; then
        echo "AT is disabled, enable it by force temporarily"
        at_max_set 600 mds ost client
        AT_MAX_SET=1
    fi

    # Only shorten the history once; ATOLDBASE doubles as the "done" marker.
    if [ -z "$ATOLDBASE" ]; then
        local at_history
        at_history=$(do_facet mds "find /sys/ -name at_history")
        [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1
        ATOLDBASE=$(do_facet mds "cat $at_history")
        # speed up the timebase so we can check decreasing AT
        do_facet mds "echo 8 >> $at_history"
        do_facet ost1 "echo 8 >> $at_history"
    fi
}

# 65a: pause one MDS request (fail_val=30000) and require an "Early reply #"
# message in the debug log, then print the client's MDC timeout estimates.
test_65a() #bug 3055
{
    at_start || return 0
    $LCTL dk > /dev/null
    debugsave
    sysctl -w lnet.debug="+other"
    # slow down a request
    do_facet mds sysctl -w lustre.fail_val=30000
    #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
    do_facet mds sysctl -w lustre.fail_loc=0x8000050a
    createmany -o $DIR/$tfile 10 > /dev/null
    unlinkmany $DIR/$tfile 10 > /dev/null
    # check for log message
    $LCTL dk | grep "Early reply #" || error "No early reply"
    # undo the debug mask change made above (pairs with debugsave)
    debugrestore
    # client should show 30s estimates
    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
    sleep 9
    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
}
run_test 65a "AT: verify early replies"

# 65b: pause bulk I/O on ost1 (fail_val=30), do a 4096-byte O_SYNC write to a
# single-stripe file and require an "Early reply #" message in the debug log.
test_65b() #bug 3055
{
    at_start || return 0
    # turn on D_ADAPTTO
    debugsave
    sysctl -w lnet.debug="+other"
    $LCTL dk > /dev/null
    # slow down bulk i/o
    do_facet ost1 sysctl -w lustre.fail_val=30
    #define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
    do_facet ost1 sysctl -w lustre.fail_loc=0x224

    rm -f $DIR/$tfile
    lfs setstripe $DIR/$tfile --index=0 --count=1
    # force some real bulk transfer
    multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c

    do_facet ost1 sysctl -w lustre.fail_loc=0
    # check for log message
    $LCTL dk | grep "Early reply #" || error "No early reply"
    debugrestore
    # client should show 30s estimates
    grep portal $LPROC/osc/${FSNAME}-OST0000-osc-*/timeouts
}
run_test 65b "AT: verify early replies on packed reply / bulk"

# 66a: pause MDS requests with fail_val 5000 and then 10000, then run without
# the fail_loc and require the current "portal 12" estimate to be below the
# worst one.
# Globals: CUR, WORST (written)
test_66a() #bug 3055
{
    at_start || return 0
    grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
    # adjust 5s at a time so no early reply is sent (within deadline)
    do_facet mds "sysctl -w lustre.fail_val=5000"
    #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
    do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts

    do_facet mds "sysctl -w lustre.fail_val=10000"
    do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts

    do_facet mds "sysctl -w lustre.fail_loc=0"
    sleep 9
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12"

    CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
    WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
    echo "Current MDT timeout $CUR, worst $WORST"
    [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
}
run_test 66a "AT: verify MDT service time adjusts with no early replies"

# 66b: pause a reply locally for (current network estimate + 5) and require
# the worst network estimate to rise above the original value.
# Globals: ORIG, CUR, WORST (written)
test_66b() #bug 3055
{
    at_start || return 0
    ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
    sysctl -w lustre.fail_val=$(($ORIG + 5))
    #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
    sysctl -w lustre.fail_loc=0x50c
    ls $DIR/$tfile > /dev/null 2>&1
    sysctl -w lustre.fail_loc=0
    CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
    WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
    echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
    [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
}
run_test 66b "AT: verify net latency adjusts"

# 67a: pause requests on ost1 (fail_val=400) during create/unlink and count
# OSC "_connect" stats before and after; extra connects are reported through
# error_ignore (bug 13721).  Always returns 0.
# Globals: CONN1, CONN2, ATTEMPTS (written)
test_67a() #bug 3055
{
    at_start || return 0
    CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
    # sleeping threads may drive values above this
    do_facet ost1 "sysctl -w lustre.fail_val=400"
    #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
    do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
    createmany -o $DIR/$tfile 20 > /dev/null
    unlinkmany $DIR/$tfile 20 > /dev/null
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
    ATTEMPTS=$(($CONN2 - $CONN1))
    echo "$ATTEMPTS osc reconnect attemps on gradual slow"
    [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect"
    return 0
}
run_test 67a "AT: verify slow request processing doesn't induce reconnects"

# 67b: pause OST create (fail_val=20000) for two consecutive copies; the
# second one must not add any OSC "_connect" events.  Always returns 0 unless
# error() aborts.
# Globals: CONN1, CONN2, CONN3, ATTEMPTS (written)
test_67b() #bug 3055
{
    at_start || return 0
    CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
    #define OBD_FAIL_OST_PAUSE_CREATE 0x223
    do_facet ost1 "sysctl -w lustre.fail_val=20000"
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
    cp /etc/profile $DIR/$tfile || error "cp failed"
    client_reconnect
    cat $LPROC/ost/OSS/ost_create/timeouts
    log "phase 2"
    CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
    ATTEMPTS=$(($CONN2 - $CONN1))
    echo "$ATTEMPTS osc reconnect attemps on instant slow"

    # do it again; should not timeout
    do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
    cp /etc/profile $DIR/$tfile || error "cp failed"
    do_facet ost1 "sysctl -w lustre.fail_loc=0"
    client_reconnect
    cat $LPROC/ost/OSS/ost_create/timeouts
    CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
    ATTEMPTS=$(($CONN3 - $CONN2))
    echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
    [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
    return 0
}
run_test 67b "AT: verify instant slowdown doesn't induce reconnects"

# 68: write $TIMEOUT into ldlm_enqueue_min, then copy a file twice with lock
# cancels paused for TIMEOUT-1 and TIMEOUT*3/2; both copies must succeed.  The
# saved ldlm_enqueue_min value is written back at the end.  Returns 0.
test_68 () #bug 13813
{
    at_start || return 0
    local ldlm_enqueue_min
    ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
    [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
    local ENQ_MIN
    ENQ_MIN=$(cat $ldlm_enqueue_min)
    echo $TIMEOUT >> $ldlm_enqueue_min
    rm -f $DIR/${tfile}_[1-2]
    lfs setstripe $DIR/$tfile --index=0 --count=1
    #define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
    sysctl -w lustre.fail_val=$(($TIMEOUT - 1))
    sysctl -w lustre.fail_loc=0x80000312
    cp /etc/profile $DIR/${tfile}_1 || error "1st cp failed $?"
    sysctl -w lustre.fail_val=$((TIMEOUT * 3 / 2))
    sysctl -w lustre.fail_loc=0x80000312
    cp /etc/profile $DIR/${tfile}_2 || error "2nd cp failed $?"
    sysctl -w lustre.fail_loc=0
    echo $ENQ_MIN >> $ldlm_enqueue_min
    return 0
}
run_test 68 "AT: verify slowing locks"

# Undo what at_start changed: write the saved at_history value back
# (best effort) ...
if [ -n "$ATOLDBASE" ]; then
    at_history=$(do_facet mds "find /sys/ -name at_history")
    do_facet mds "echo $ATOLDBASE >> $at_history" || true
    do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
fi

# ... and disable AT again if at_start had to force-enable it.
if [ $AT_MAX_SET -ne 0 ]; then
    echo "restore AT status to be disabled"
    at_max_set 0 mds ost client
fi

# end of AT tests includes above lines

equals_msg $(basename $0): test complete, cleaning up
check_and_cleanup_lustre
# Print the suite log if there is one; never let this line fail the script.
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -