Monday, February 28, 2011

How can I confirm there was an I/O problem (2) ?

In a previous post, I have mentioned that I have tried to mine AWR data to confirm an I/O issue occurred around 2:18 PM of Feb 18; however I encountered some issue and instead found out 'sar' was helpful. Today, I worked out the query against AWR tables. The query is as follows:

rem  awr_io_file_smpl.sql  
rem   -- Adapted from the scripts written by Karl Arao: http://karlarao.wordpress.com
rem
rem  check if there are I/O problem from AWR. 
rem  normally atpr (Average time per read)  should be less than 20 ms
rem  if atpr > 100 ms, it obviously indicates that we have problem
rem   
rem  note: the statistics are based on  (tm-interval, tm)
rem    


col atpr format 999,999
col atpw format 999,999

set echo off
-- set markup HTML ON
-- SPOOL ON ENTMAP ON PREFORMAT OFF
-- spool awr_io_file_smpl.html
spool awr_io_file_smpl.log
select * from (
SELECT 
--      s0.snap_id snap_id,
       to_char(s1.END_INTERVAL_TIME, 'YYMMDD HH24:MI') tm,
--      s0.instance_number inst,
       ROUND(EXTRACT(DAY FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 1440 + EXTRACT(HOUR FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 60 + EXTRACT(MINUTE FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) + EXTRACT(SECOND FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) / 60, 2) dur,
--      e.tsname,
      e.file# ,
--      SUBSTR(e.filename, 1, 52) filename ,
--      e.readtim         - NVL(b.readtim,0) readtim ,
      e.phyrds          - NVL(b.phyrds,0) reads ,
      DECODE ((e.phyrds - NVL(b.phyrds, 0)), 0, to_number(NULL), ((e.readtim - NVL(b.readtim,0)) 
                   / (e.phyrds - NVL(b.phyrds,0)))*10) atpr ,
--      (e.phyrds         - NVL(b.phyrds,0)) / ((ROUND(EXTRACT(DAY FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 1440 + EXTRACT(HOUR FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 60 + EXTRACT(MINUTE FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) + EXTRACT(SECOND FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) / 60, 2))*60) rps  ,
--      DECODE ((e.phyrds  - NVL(b.phyrds, 0)), 0, to_number(NULL), (e.phyblkrd - NVL(b.phyblkrd,0)) / (e.phyrds - NVL(b.phyrds,0)) ) bpr ,
--      e.writetim         - NVL(b.writetim,0) writetim ,
      e.phywrts          - NVL(b.phywrts,0) writes ,
      DECODE ((e.phywrts - NVL(b.phywrts, 0)), 0, to_number(NULL), ((e.writetim - NVL(b.writetim,0)) / (e.phywrts - NVL(b.phywrts,0)))*10) atpw ,
--      (e.phywrts         - NVL(b.phywrts,0)) / ((ROUND(EXTRACT(DAY FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 1440 + EXTRACT(HOUR FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 60 + EXTRACT(MINUTE FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) + EXTRACT(SECOND FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) / 60, 2))*60) wps ,
--      DECODE ((e.phywrts    - NVL(b.phywrts, 0)), 0, to_number(NULL), (e.phyblkwrt - NVL(b.phyblkwrt,0)) / (e.phywrts - NVL(b.phywrts,0)) ) bpw ,
--      e.wait_count          - NVL(b.wait_count,0) waits ,
--      DECODE ((e.wait_count - NVL(b.wait_count, 0)), 0, 0, ((e.time - NVL(b.time,0)) / (e.wait_count - NVL(b.wait_count,0)))*10) atpwt,
      (e.phyrds             - NVL(b.phyrds,0)) + (e.phywrts - NVL(b.phywrts,0)) ios,
      ((e.phyrds            - NVL(b.phyrds,0)) + (e.phywrts - NVL(b.phywrts,0))) / ((ROUND(EXTRACT(DAY FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 1440 + EXTRACT(HOUR FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) * 60 + EXTRACT(MINUTE FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) + EXTRACT(SECOND FROM s1.END_INTERVAL_TIME - s0.END_INTERVAL_TIME) / 60, 2))*60) iops
    FROM dba_hist_snapshot s0,
      dba_hist_snapshot s1,
      dba_hist_filestatxs e,
      dba_hist_filestatxs b
    WHERE s0.dbid =(select dbid from v$database)
    AND s1.dbid            = s0.dbid
    AND b.dbid(+)          = s0.dbid -- begin dbid
    AND e.dbid             = s0.dbid -- end dbid
    AND b.dbid             = e.dbid  -- remove oj
    AND s0.instance_number =dbms_utility.CURRENT_INSTANCE
    AND s1.instance_number = s0.instance_number
    AND b.instance_number(+) = s0.instance_number 
    AND e.instance_number    = s0.instance_number 
    AND b.instance_number    = e.instance_number 
    AND s1.snap_id           = s0.snap_id + 1
    AND b.snap_id(+)         = s0.snap_id      
    AND e.snap_id            = s0.snap_id + 1 
    AND b.tsname             = e.tsname      
    AND b.file#              = e.file#           
    AND b.creation_change#   = e.creation_change# 
    AND ((e.phyrds - NVL(b.phyrds,0)) + (e.phywrts - NVL(b.phywrts,0))) > 0
    AND s0.END_INTERVAL_TIME >= to_date('2011-02-18 13:00:00', 'YYYY-MM-DD HH24:MI:SS') 
    AND s0.END_INTERVAL_TIME <= to_date('2011-02-18 16:00:00', 'YYYY-MM-DD HH24:MI:SS') 
    order by s1.END_INTERVAL_TIME, e.tsname
--    and b.tsname='xxxx'
)
where atpr > 100 
;
spool off
set markup html off spool off
set termout on



The output of above query is as follows:

TM                  DUR      FILE#      READS     ATPR     WRITES     ATPW        IOS       IOPS
------------ ---------- ---------- ---------- -------- ---------- -------- ---------- ----------
110218 14:00      15.04        268          8      124          1        0          9 .009973404
110218 14:00      15.04        283          7      141          1        0          8 .008865248
110218 14:00      15.04        281          8      118          1        0          9 .009973404
110218 14:00      15.04        296          8      116          1        0          9 .009973404
110218 14:00      15.04        273          8      123          1        0          9 .009973404
110218 14:00      15.04        293          8      123          1        0          9 .009973404
110218 14:00      15.04        288          8      125          1        0          9 .009973404
110218 14:00      15.04        276          8      120          1        0          9 .009973404
110218 14:00      15.04        286          7      133          1        0          8 .008865248
110218 14:00      15.04        278          8      123          1        0          9 .009973404
110218 14:00      15.04        308          8      125          1        0          9 .009973404
110218 14:00      15.04        271          8      116          1        0          9 .009973404
110218 14:00      15.04        248          7      139          1       10          8 .008865248
110218 14:30       14.6        502       3472      205        620      816       4092 4.67123288
110218 14:30       14.6        519        351    1,695         72    1,749        423 .482876712
110218 14:30       14.6        515        664      347         90        6        754 .860730594
110218 14:30       14.6        504       1174      214        114    1,107       1288 1.47031963
110218 14:30       14.6        448       8463      139        550      461       9013 10.2888128
110218 14:30       14.6        412       5772      131        361      353       6133 7.00114155
110218 14:30       14.6        462       5889      215        437      291       6326 7.22146119
110218 14:30       14.6        429      10276      313        356      360      10632 12.1369863
110218 14:30       14.6        427       4255      189        183      691       4438 5.06621005
110218 14:30       14.6        473       7355      298        865      149       8220 9.38356164
110218 14:30       14.6        513        867      148        218      587       1085 1.23858447
110218 14:30       14.6        479       5903      196        986      144       6889 7.86415525
110218 14:30       14.6        475       7702      263       1163      221       8865  10.119863
110218 14:30       14.6        471       9543      155        141      906       9684 11.0547945
110218 14:30       14.6        523       2187      378       1014      253       3201 3.65410959
110218 14:30       14.6        482       6674      150       1536      252       8210 9.37214612
110218 14:30       14.6        437      10793      271       1920      401      12713 14.5125571
110218 14:30       14.6        395       8062      151        555      233       8617 9.83675799
110218 14:30       14.6        464      19537      431       4212      144      23749 27.1107306
110218 14:30       14.6        491      14568      440       4247      220      18815 21.4783105
110218 14:30       14.6        444      20026      385       2541      212      22567 25.7614155
110218 14:30       14.6        481      15496      444       3763      287      19259 21.9851598
110218 14:30       14.6        421      14930      334       1642       85      16572 18.9178082
110218 14:30       14.6        514       5464      597       2542      360       8006 9.13926941
110218 14:30       14.6        435       3804      165        167      751       3971 4.53310502
110218 14:30       14.6        512       2065      237        474      276       2539 2.89840183
110218 14:30       14.6        498       4540      186        508      993       5048 5.76255708
110218 14:30       14.6        478       8505      159       1161      561       9666 11.0342466
110218 14:30       14.6        470       8134      182        454      283       8588 9.80365297
110218 14:30       14.6        460       4841      168        391      325       5232 5.97260274
110218 14:30       14.6        474       6486      130        779      166       7265   8.293379
110218 14:30       14.6        411       3368      291        186      678       3554 4.05707763
110218 14:30       14.6        521       2447      183        543      235       2990 3.41324201
110218 14:30       14.6        418       3837      124        128      991       3965 4.52625571
110218 14:30       14.6        100        160    1,370        670      213        830 .947488584
110218 14:30       14.6        139        408      273       1061      366       1469 1.67694064
110218 15:00      15.03         62          4      103          1       10          5 .005544467
110218 15:30      15.04         69          6      107          2        0          8 .008865248
110218 15:30      15.04         62          6      102          2        5          8 .008865248
110218 15:30      15.04         63          6      102          2        5          8 .008865248
110218 15:30      15.04         64          6      107          2        5          8 .008865248
110218 15:30      15.04         78          6      113          2        5          8 .008865248
110218 15:30      15.04         77          6      107          2       10          8 .008865248
110218 15:30      15.04         66          6      107          2        5          8 .008865248
110218 16:00      15.04        125          6      282          1      180          7 .007757092
110218 16:00      15.04        110          6      282          1       70          7 .007757092

59 rows selected.

From the above output, it can be seen from about 14:15 -14:30, there are many data files having APTR > 100 ms. This clearly indicates there was I/O problem during that period.

No comments: