Improve bad-split-repair on multiday intervals

Switch some repair log msgs from warning -> info
pull/1633/head
Value Raider 2023-07-31 18:19:42 +01:00
parent 4a1e1c4447
commit 93b6e024da
1 changed file with 33 additions and 31 deletions

@@ -619,7 +619,7 @@ class TickerBase:
else:
msg += f" {start_d}"
msg += ", too old, Yahoo will reject request for finer-grain data"
logger.warning(msg)
logger.info(msg)
continue
td_1d = _datetime.timedelta(days=1)
@@ -703,7 +703,7 @@ class TickerBase:
common_index = _np.intersect1d(df_block.index, df_new.index)
if len(common_index) == 0:
# Can't calibrate so don't attempt repair
logger.warning(f"Can't calibrate {interval} block starting {start_d} so aborting repair")
logger.info(f"Can't calibrate {interval} block starting {start_d} so aborting repair")
continue
# First, attempt to calibrate the 'Adj Close' column. OK if cannot.
# Only necessary for 1d interval, because the 1h data is not div-adjusted.
@@ -752,7 +752,7 @@ class TickerBase:
calib_filter = (df_block_calib != tag)
if not calib_filter.any():
# Can't calibrate so don't attempt repair
logger.warning(f"Can't calibrate {interval} block starting {start_d} so aborting repair")
logger.info(f"Can't calibrate {interval} block starting {start_d} so aborting repair")
continue
# Avoid divide-by-zero warnings:
for j in range(len(calib_cols)):
@@ -858,7 +858,7 @@ class TickerBase:
return df
if df.shape[0] == 1:
# Need multiple rows to confidently identify outliers
logger.warning("price-repair-100x: Cannot check single-row table for 100x price errors")
logger.info("price-repair-100x: Cannot check single-row table for 100x price errors")
if not "Repaired?" in df.columns:
df["Repaired?"] = False
return df
@@ -883,7 +883,7 @@ class TickerBase:
else:
df2_zeroes = None
if df2.shape[0] <= 1:
logger.warning("price-repair-100x: Insufficient good data for detecting 100x price errors")
logger.info("price-repair-100x: Insufficient good data for detecting 100x price errors")
if not "Repaired?" in df.columns:
df["Repaired?"] = False
return df
@@ -1053,7 +1053,7 @@ class TickerBase:
return df
if f_prices_bad.sum() == len(price_cols)*len(df2):
# Need some good data to calibrate
logger.warning("price-repair-missing: No good data for calibration so cannot fix price=0 bad data")
logger.info("price-repair-missing: No good data for calibration so cannot fix price=0 bad data")
if not "Repaired?" in df.columns:
df["Repaired?"] = False
return df
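The hunks above all share one guard pattern: when there is too little clean data to calibrate a repair, the code logs the condition (now at info level instead of warning), makes sure a "Repaired?" flag column exists, and hands the frame back unmodified. A minimal standalone sketch of that pattern follows; the helper name and toy frame are illustrative, not yfinance's own code:

```python
import logging

import pandas as pd

logger = logging.getLogger(__name__)

def _bail_out_unrepaired(df: pd.DataFrame, reason: str) -> pd.DataFrame:
    # Hypothetical helper mirroring the guard pattern in the diff: not enough
    # good data to calibrate, so log at info level, make sure the 'Repaired?'
    # flag column exists, and return the data unchanged.
    logger.info(reason)
    if "Repaired?" not in df.columns:
        df["Repaired?"] = False
    return df

# Example: a single-row table cannot be checked for 100x price errors.
df = pd.DataFrame({"Close": [10.0]})
df = _bail_out_unrepaired(
    df, "price-repair-100x: Cannot check single-row table for 100x price errors")
```

Downgrading these messages to info fits the pattern: an unrepairable block is an expected outcome of the heuristic, not something the user can act on.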
@@ -1240,8 +1240,6 @@ class TickerBase:
# If stock is currently suspended and not in USA, then usually Yahoo introduces
# 100x errors into suspended intervals. Clue is no price change and 0 volume.
# Better to use last active trading interval as baseline.
# But can't be smart
# But can only do with 1d:
f_no_activity = (df2['Low'] == df2['High']) & (df2['Volume']==0)
appears_suspended = f_no_activity.any() and _np.where(f_no_activity)[0][0]==0
f_active = ~f_no_activity
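The comment above describes the suspended-stock heuristic: while a non-US ticker is suspended, Yahoo tends to report intervals with no price movement and zero volume (and sometimes injects 100x errors into them), so the last active interval is a better baseline. A rough, self-contained illustration of that detection on a made-up frame, assuming, as the surrounding code appears to, that row 0 is the most recent interval:

```python
import numpy as np
import pandas as pd

# Made-up OHLCV rows; row 0 is assumed to be the most recent interval.
df2 = pd.DataFrame({
    "Low":    [5.0, 5.0, 4.8, 4.9],
    "High":   [5.0, 5.0, 5.1, 5.0],
    "Volume": [0,   0,   1200, 900],
})

# No activity: price did not move and nothing traded in the interval.
f_no_activity = (df2["Low"] == df2["High"]) & (df2["Volume"] == 0)

# Looks suspended only if the most recent interval (index 0) is inactive.
appears_suspended = f_no_activity.any() and np.where(f_no_activity)[0][0] == 0

# Most recent *active* interval, used as the baseline for the error checks.
f_active = ~f_no_activity
idx_latest_active = int(np.where(f_active)[0][0]) if f_active.any() else None

print(appears_suspended, idx_latest_active)  # True 2
```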
@@ -1258,7 +1256,7 @@ class TickerBase:
debug_cols = ['Low', 'High']
# Calculate daily price % change. To reduce effect of price volatility,
# calculate change for each OHLC column and use average.
# calculate change for each OHLC column.
if interday and interval != '1d' and split not in [100.0, 100, 0.001]:
# Avoid using 'Low' and 'High'. For multiday intervals, these can be
# very volatile so reduce ability to detect genuine stock split errors
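This hunk narrows how the per-interval price change is measured: for multiday intervals other than 1d (and candidate splits that are not 100x or 0.001x), 'Low' and 'High' are skipped because their volatility can mask genuine split errors. A loose sketch of that column selection with an averaged percent-change calculation; the data, and the use of pct_change, are illustrative rather than the library's exact computation:

```python
import pandas as pd

# Illustrative inputs; in the repair code these come from the request/detection.
interval = "1wk"
interday = True           # interval spans whole days
split = 10.0              # candidate split ratio, not a 100x/0.001x error

df2 = pd.DataFrame({
    "Open":  [10.0, 10.2, 1.05, 1.01],
    "High":  [10.5, 10.9, 1.20, 1.10],
    "Low":   [ 9.8,  9.9, 0.95, 0.98],
    "Close": [10.2, 10.4, 1.02, 1.05],
})

if interday and interval != '1d' and split not in [100.0, 100, 0.001]:
    # Multiday intervals: Low/High are too volatile, use Open/Close only.
    change_cols = ['Open', 'Close']
else:
    change_cols = ['Open', 'High', 'Low', 'Close']

# Period-over-period % change per column, averaged over the selected columns.
avg_change = df2[change_cols].pct_change(fill_method=None).mean(axis=1)
print(avg_change)  # the ~90% drop at row 2 flags a possible unadjusted 10:1 split
```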
@@ -1365,11 +1363,14 @@ class TickerBase:
logger.info(f'price-repair-split: No {fix_type}s detected')
return df
if logger.isEnabledFor(logging.DEBUG):
logger.debug(f"price-repair-split: my workings:")
logger.debug('\n' + str(df_debug))
# if logger.isEnabledFor(logging.DEBUG):
# logger.debug(f"price-repair-split: my workings:")
# logger.debug('\n' + str(df_debug))
def map_signals_to_ranges(f, f_up, f_down):
if not f.any():
return []
true_indices = _np.where(f)[0]
ranges = []
@@ -1382,7 +1383,7 @@ class TickerBase:
if idx_latest_active is None:
true_indices_old = []
else:
true_indices_old = [i for i in true_indices if i >= idx_latest_active]
true_indices_old = [i for i in true_indices if i > idx_latest_active]
if len(true_indices_old) > 0:
for i in range(len(true_indices_old) - 1):
if i % 2 == 0:
@@ -1401,26 +1402,27 @@ class TickerBase:
# Next, process prices more recent than idx_latest_active:
true_indices_recent = [i for i in true_indices if i not in true_indices_old]
if split > 1.0:
adj = 'split' if f_up[true_indices_recent[0]] else '1.0/split'
else:
adj = '1.0/split' if f_up[true_indices_recent[0]] else 'split'
ranges.append((0, true_indices_recent[0], adj))
for i in range(1, len(true_indices_recent) - 1):
if i % 2 == 1:
if split > 1.0:
adj = '1.0/split' if f_up[true_indices_recent[i]] else 'split'
else:
adj = 'split' if f_up[true_indices_recent[i]] else '1.0/split'
ranges.append((true_indices_recent[i], true_indices_recent[i+1], adj))
if len(true_indices_recent) % 2 == 0:
if len(true_indices_recent) > 0:
if split > 1.0:
adj = 'split' if f_down[true_indices_recent[-1]] else '1.0/split'
adj = 'split' if f_up[true_indices_recent[0]] else '1.0/split'
else:
adj = '1.0/split' if f_down[true_indices_recent[-1]] else 'split'
ranges.append((true_indices_recent[-1], len(f), adj))
adj = '1.0/split' if f_up[true_indices_recent[0]] else 'split'
ranges.append((0, true_indices_recent[0], adj))
for i in range(1, len(true_indices_recent) - 1):
if i % 2 == 1:
if split > 1.0:
adj = '1.0/split' if f_up[true_indices_recent[i]] else 'split'
else:
adj = 'split' if f_up[true_indices_recent[i]] else '1.0/split'
ranges.append((true_indices_recent[i], true_indices_recent[i+1], adj))
if len(true_indices_recent) % 2 == 0:
if split > 1.0:
adj = 'split' if f_down[true_indices_recent[-1]] else '1.0/split'
else:
adj = '1.0/split' if f_down[true_indices_recent[-1]] else 'split'
ranges.append((true_indices_recent[-1], len(f), adj))
ranges = sorted(ranges, key=lambda x: x[0])
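
Most of the final hunk rewrites map_signals_to_ranges, which turns boolean split-error signals into (start, end, adjustment) index ranges; the new version appears to change the old-signal filter from `i >= idx_latest_active` to `i > idx_latest_active` and wraps the recent-signal handling in a `len(true_indices_recent) > 0` guard. A heavily simplified sketch of the pairing idea, ignoring the split-direction and suspended-stock branches, and not yfinance's actual implementation:

```python
import numpy as np

def map_signals_to_ranges_sketch(f: np.ndarray) -> list[tuple[int, int, str]]:
    """Toy version: pair up True positions in f into half-open index ranges
    that would need an adjustment applied. The real code also decides whether
    each range gets 'split' or '1.0/split' from the signal direction."""
    if not f.any():
        return []
    true_indices = np.where(f)[0]
    ranges = []
    # Everything before the first signal is treated as one adjustable range.
    ranges.append((0, int(true_indices[0]), "adjust"))
    # Interior signals pair up: every second signal opens a new range.
    for i in range(1, len(true_indices) - 1):
        if i % 2 == 1:
            ranges.append((int(true_indices[i]), int(true_indices[i + 1]), "adjust"))
    # An even signal count means the last range runs to the end of the data.
    if len(true_indices) % 2 == 0:
        ranges.append((int(true_indices[-1]), len(f), "adjust"))
    return sorted(ranges, key=lambda x: x[0])

f = np.array([False, False, True, False, False, True, False])
print(map_signals_to_ranges_sketch(f))  # [(0, 2, 'adjust'), (5, 7, 'adjust')]
```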