Refactor check_Yahoo_response() to work with latest decryption

feature/session-prune-v2
ValueRaider 2023-02-07 13:59:14 +00:00
parent 14c6136699
commit 4d8ca3777a
1 changed file with 132 additions and 121 deletions

@@ -177,101 +177,6 @@ def enable_prune_session_cache():
def disable_prune_session_cache():
    global prune_session_cache
    prune_session_cache = False

def check_Yahoo_response(r, *args, **kwargs):
    # Parse the data returned by Yahoo to determine if corrupt/incomplete.
    # If bad, set 'status_code' to 204 ("No content"), which stops it
    # entering a requests_cache.
    # Because this involves parsing, the output is added to the response
    # object with a "yf_" prefix and reused elsewhere.
    if "yahoo.com/" not in r.url:
        # Only check Yahoo responses
        return

    attrs = dir(r)
    r_from_cache = "from_cache" in attrs and r.from_cache

    if "yf_data" in attrs or "yf_json" in attrs or "yf_html_pd" in attrs:
        # Have already parsed this response, successfully
        return

    if "Will be right back" in r.text:
        # Simple check, no parsing needed
        r.status_code = 204
        return r

    parse_failed = False
    r_modified = False
    if "/ws/fundamentals-timeseries" in r.url:
        # Timeseries
        try:
            data = r.json()
            r.yf_json = data
            r_modified = True
            data["timeseries"]["result"]  # probe that the expected key exists
        except Exception:
            parse_failed = True
    elif "/finance/chart/" in r.url:
        # Prices
        try:
            data = r.json()
            r.yf_json = data
            r_modified = True
            if data["chart"]["error"] is not None:
                parse_failed = True
        except Exception:
            parse_failed = True
    elif "/finance/options/" in r.url:
        # Options
        if "expirationDates" not in r.text:
            # Parse will fail
            parse_failed = True
    elif "/finance/search?" in r.url:
        # News, can't be bothered to check
        return
    elif "/calendar/earnings?" in r.url:
        try:
            dfs = _pd.read_html(r.text)
        except Exception:
            parse_failed = True
        else:
            r.yf_html_pd = dfs
            r_modified = True
    elif "root.App.main" in r.text:
        # JSON data stores
        try:
            json_str = r.text.split('root.App.main =')[1].split(
                '(this)')[0].split(';\n}')[0].strip()
        except IndexError:
            parse_failed = True
        if not parse_failed:
            data = json.loads(json_str)
            if "_cs" in data and "_cr" in data:
                data = decrypt_cryptojs_aes(data)
            if "context" in data and "dispatcher" in data["context"]:
                # Keep old code, just in case
                data = data['context']['dispatcher']['stores']
            if "yf_data" not in attrs:
                r.yf_data = data
                r_modified = True
            if "QuoteSummaryStore" not in data:
                parse_failed = True
    else:
        return

    if parse_failed:
        if not r_from_cache:
            r.status_code = 204  # No content
            r_modified = True
    if r_modified:
        return r

class TickerData:
    """
@@ -284,7 +189,7 @@ class TickerData:
        self.ticker = ticker
        self._session = session or requests

    def check_requests_cache_hook(self):
    def _check_requests_cache_hook(self):
        try:
            c = self._session.cache
        except AttributeError:
@@ -293,12 +198,10 @@ class TickerData:
        global prune_session_cache
        if not prune_session_cache:
            self._session.hooks["response"] = []
        elif prune_session_cache and check_Yahoo_response not in self._session.hooks["response"]:
            self._session.hooks["response"].append(check_Yahoo_response)
        elif prune_session_cache and self._check_Yahoo_response not in self._session.hooks["response"]:
            self._session.hooks["response"].append(self._check_Yahoo_response)
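
For context: requests runs every callable in session.hooks["response"] after each request, and if a hook returns a Response object, requests substitutes it for the original. A minimal standalone sketch of that mechanism (illustrative only, not part of this commit):

import requests

def tag_response(r, *args, **kwargs):
    # A response hook receives the Response plus the request kwargs;
    # returning a Response makes requests use it in place of the original,
    # while returning None leaves the original untouched.
    r.headers["X-Checked"] = "1"
    return r

session = requests.Session()
if tag_response not in session.hooks["response"]:
    session.hooks["response"].append(tag_response)

This is why check_Yahoo_response only returns r when it has actually modified it.
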
    def get(self, url, user_agent_headers=None, params=None, proxy=None, timeout=30):
        self.check_requests_cache_hook()
        proxy = self._get_proxy(proxy)
        response = self._session.get(
            url=url,
@@ -381,6 +284,132 @@ class TickerData:
        return []

    def _gather_keys_from_response(self, response, data):
        # Gather decryption keys. 'data' is the parsed JSON from the page,
        # needed by the backup key-extraction method.
        soup = BeautifulSoup(response.content, "html.parser")
        keys = self._get_decryption_keys_from_yahoo_js(soup)
        if len(keys) == 0:
            msg = "No decryption keys could be extracted from JS file."
            if "requests_cache" in str(type(response)):
                msg += " Try flushing your 'requests_cache'; it is probably serving old JS."
            print("WARNING: " + msg + " Falling back to backup decrypt methods.")
        if len(keys) == 0:
            keys = []
            try:
                extra_keys = _extract_extra_keys_from_stores(data)
                keys = [''.join(extra_keys[-4:])]
            except Exception:
                pass
            # Last resort: fetch the maintained key list from GitHub
            keys_url = "https://github.com/ranaroussi/yfinance/raw/main/yfinance/scrapers/yahoo-keys.txt"
            response_gh = self.cache_get(keys_url)
            keys += response_gh.text.splitlines()
        return keys
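
The returned list is a set of candidate passphrases, not a single known-good key, so the decrypt step has to try them in turn. A minimal sketch of that pattern, assuming a hypothetical single-key helper decrypt_with_key() that raises on failure:

import json

def try_keys(encrypted_payload, keys):
    # Attempt decryption with each candidate key; the first output that
    # parses as JSON wins. Returns None if every key fails.
    for key in keys:
        try:
            plaintext = decrypt_with_key(encrypted_payload, key)  # hypothetical helper
            return json.loads(plaintext)
        except Exception:
            continue
    return None
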
    def _check_Yahoo_response(self, r, *args, **kwargs):
        # Parse the data returned by Yahoo to determine if corrupt/incomplete.
        # If bad, set 'status_code' to 204 ("No content"), which stops it
        # entering a requests_cache.
        # Because this involves parsing, the output is added to the response
        # object with a "yf_" prefix and reused elsewhere.
        if "yahoo.com/" not in r.url:
            # Only check Yahoo responses
            return

        attrs = dir(r)
        r_from_cache = "from_cache" in attrs and r.from_cache

        if "yf_data" in attrs or "yf_json" in attrs or "yf_html_pd" in attrs:
            # Have already parsed this response, successfully
            return

        if "Will be right back" in r.text:
            # Simple check, no parsing needed
            r.status_code = 204
            return r

        parse_failed = False
        r_modified = False
        if "/ws/fundamentals-timeseries" in r.url:
            # Timeseries
            try:
                data = r.json()
                r.yf_json = data
                r_modified = True
                data["timeseries"]["result"]  # probe that the expected key exists
            except Exception:
                parse_failed = True
        elif "/finance/chart/" in r.url:
            # Prices
            try:
                data = r.json()
                r.yf_json = data
                r_modified = True
                if data["chart"]["error"] is not None:
                    parse_failed = True
            except Exception:
                parse_failed = True
        elif "/finance/options/" in r.url:
            # Options
            if "expirationDates" not in r.text:
                # Parse will fail
                parse_failed = True
        elif "/finance/search?" in r.url:
            # News, can't be bothered to check
            return
        elif "/calendar/earnings?" in r.url:
            try:
                dfs = _pd.read_html(r.text)
            except Exception:
                parse_failed = True
            else:
                r.yf_html_pd = dfs
                r_modified = True
        elif "root.App.main" in r.text:
            # JSON data stores
            try:
                json_str = r.text.split('root.App.main =')[1].split(
                    '(this)')[0].split(';\n}')[0].strip()
            except IndexError:
                parse_failed = True
            if not parse_failed:
                data = json.loads(json_str)
                keys = self._gather_keys_from_response(r, data)
                # Decrypt!
                stores = decrypt_cryptojs_aes_stores(data, keys)
                if stores is None:
                    # Maybe Yahoo returned old format, not encrypted
                    if "context" in data and "dispatcher" in data["context"]:
                        stores = data['context']['dispatcher']['stores']
                if stores is None:
                    # raise Exception(f"{self.ticker}: Failed to extract data stores from web request")
                    print(f"{self.ticker}: Failed to decrypt/extract data stores from web request")
                    parse_failed = True
                if "yf_data" not in attrs:
                    # if not parse_failed and "yf_data" not in attrs:
                    r.yf_data = stores
                    r_modified = True
                if stores is not None and "QuoteSummaryStore" not in stores:
                    parse_failed = True
        else:
            return

        if parse_failed:
            if not r_from_cache:
                r.status_code = 204  # No content
                r_modified = True
        if r_modified:
            return r
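
The 204 trick works because requests_cache only stores responses whose status code is listed in allowable_codes, which defaults to (200,). A minimal sketch of the kind of session this hook is designed to cooperate with, assuming the requests-cache package:

import requests_cache

# With the default allowable_codes=(200,), a response that the hook has
# rewritten to 204 is still returned to the caller but never cached, so
# the next request re-fetches from Yahoo instead of replaying bad data.
session = requests_cache.CachedSession("yfinance.cache")
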
    @lru_cache_freezeargs
    @lru_cache(maxsize=cache_maxsize)
    def get_json_data_stores(self, sub_page: str = None, proxy=None) -> dict:
@@ -393,12 +422,12 @@ class TickerData:
        ticker_url = "{}/{}".format(_SCRAPE_URL_, self.ticker)

        # Ensure hook ready to intercept get responses
        self.check_requests_cache_hook()
        self._check_requests_cache_hook()
        response = self.get(url=ticker_url, proxy=proxy)

        if "yf_data" in dir(response):
            # check_requests_cache_hook() already successfully extracted & decrypted
            # _check_requests_cache_hook() already successfully extracted & decrypted
            stores = response.yf_data
        else:
            # Extract JSON and decrypt
@@ -416,25 +445,7 @@ class TickerData:
            data = json.loads(json_str)

            # Gather decryption keys:
            soup = BeautifulSoup(response.content, "html.parser")
            keys = self._get_decryption_keys_from_yahoo_js(soup)
            if len(keys) == 0:
                msg = "No decryption keys could be extracted from JS file."
                if "requests_cache" in str(type(response)):
                    msg += " Try flushing your 'requests_cache'; it is probably serving old JS."
                print("WARNING: " + msg + " Falling back to backup decrypt methods.")
            if len(keys) == 0:
                keys = []
                try:
                    extra_keys = _extract_extra_keys_from_stores(data)
                    keys = [''.join(extra_keys[-4:])]
                except Exception:
                    pass
                # Last resort: fetch the maintained key list from GitHub
                keys_url = "https://github.com/ranaroussi/yfinance/raw/main/yfinance/scrapers/yahoo-keys.txt"
                response_gh = self.cache_get(keys_url)
                keys += response_gh.text.splitlines()
            keys = self._gather_keys_from_response(response, data)

            # Decrypt!
            stores = decrypt_cryptojs_aes_stores(data, keys)
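
decrypt_cryptojs_aes_stores() is defined elsewhere in this file. For orientation: CryptoJS's passphrase mode follows OpenSSL's convention, where the base64 payload begins with "Salted__" plus an 8-byte salt, and the AES-256-CBC key/IV pair is derived with MD5-based EVP_BytesToKey. A rough sketch of that convention (assumptions: MD5 key derivation and PKCS7 padding; the actual helper may differ):

import base64
import hashlib
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

def evp_bytes_to_key(password: bytes, salt: bytes, key_len=32, iv_len=16):
    # OpenSSL EVP_BytesToKey: repeatedly MD5-hash (prev_block + password + salt)
    # until enough bytes exist for both key and IV.
    derived, block = b"", b""
    while len(derived) < key_len + iv_len:
        block = hashlib.md5(block + password + salt).digest()
        derived += block
    return derived[:key_len], derived[key_len:key_len + iv_len]

def decrypt_cryptojs_payload(encrypted_b64: str, passphrase: str) -> bytes:
    raw = base64.b64decode(encrypted_b64)
    assert raw[:8] == b"Salted__"  # OpenSSL/CryptoJS header
    salt, ciphertext = raw[8:16], raw[16:]
    key, iv = evp_bytes_to_key(passphrase.encode(), salt)
    decryptor = Cipher(algorithms.AES(key), modes.CBC(iv)).decryptor()
    padded = decryptor.update(ciphertext) + decryptor.finalize()
    unpadder = padding.PKCS7(128).unpadder()
    return unpadder.update(padded) + unpadder.finalize()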