Fix preview of imgur and Tenor URLs. (#11669)

By scraping Open Graph information from the HTML even when an autodiscovery endpoint is found. The results are then combined to capture as much information as possible from the page.
3 years ago · 15ffc4143c
parent 9eab71aa93
commit 15ffc4143c
4 changed files with 39 additions and 14 deletions
--- a/changelog.d/11669.bugfix
+++ b/changelog.d/11669.bugfix
@ -0,0 +1 @@
+Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi.
--- a/docs/development/url_previews.md
+++ b/docs/development/url_previews.md
@ -35,7 +35,12 @@ When Synapse is asked to preview a URL it does the following:
   5. If the media is HTML:
      1. Decodes the HTML via the stored file.
      2. Generates an Open Graph response from the HTML.
-      3. If an image exists in the Open Graph response:
+      3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
+         1. Downloads the URL and stores it into a file via the media storage provider
+            and saves the local media metadata.
+         2. Convert the oEmbed response to an Open Graph response.
+         3. Override any Open Graph data from the HTML with data from oEmbed.
+      4. If an image exists in the Open Graph response:
         1. Downloads the URL and stores it into a file via the media storage
            provider and saves the local media metadata.
         2. Generates thumbnails.
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@ -33,6 +33,8 @@ logger = logging.getLogger(__name__)
 class OEmbedResult:
    # The Open Graph result (converted from the oEmbed result).
    open_graph_result: JsonDict
+    # The author_name of the oEmbed result
+    author_name: Optional[str]
    # Number of milliseconds to cache the content, according to the oEmbed response.
    #
    # This will be None if no cache-age is provided in the oEmbed response (or
@ -154,11 +156,12 @@ class OEmbedProvider:
                "og:url": url,
            }

-            # Use either title or author's name as the title.
-            title = oembed.get("title") or oembed.get("author_name")
+            title = oembed.get("title")
            if title:
                open_graph_response["og:title"] = title

+            author_name = oembed.get("author_name")
+
            # Use the provider name and as the site.
            provider_name = oembed.get("provider_name")
            if provider_name:
@ -193,9 +196,10 @@ class OEmbedProvider:
            # Trap any exception and let the code follow as usual.
            logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
            open_graph_response = {}
+            author_name = None
            cache_age = None

-        return OEmbedResult(open_graph_response, cache_age)
+        return OEmbedResult(open_graph_response, author_name, cache_age)


 def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@ -262,6 +262,7 @@ class PreviewUrlResource(DirectServeJsonResource):

        # The number of milliseconds that the response should be considered valid.
        expiration_ms = media_info.expires
+        author_name: Optional[str] = None

        if _is_media(media_info.media_type):
            file_id = media_info.filesystem_id
@ -294,17 +295,25 @@ class PreviewUrlResource(DirectServeJsonResource):
                # Check if this HTML document points to oEmbed information and
                # defer to that.
                oembed_url = self._oembed.autodiscover_from_html(tree)
-                og = {}
+                og_from_oembed: JsonDict = {}
                if oembed_url:
                    oembed_info = await self._download_url(oembed_url, user)
-                    og, expiration_ms = await self._handle_oembed_response(
+                    (
+                        og_from_oembed,
+                        author_name,
+                        expiration_ms,
+                    ) = await self._handle_oembed_response(
                        url, oembed_info, expiration_ms
                    )

-                # If there was no oEmbed URL (or oEmbed parsing failed), attempt
-                # to generate the Open Graph information from the HTML.
-                if not oembed_url or not og:
-                    og = parse_html_to_open_graph(tree, media_info.uri)
+                # Parse Open Graph information from the HTML in case the oEmbed
+                # response failed or is incomplete.
+                og_from_html = parse_html_to_open_graph(tree, media_info.uri)
+
+                # Compile the Open Graph response by using the scraped
+                # information from the HTML and overlaying any information
+                # from the oEmbed response.
+                og = {**og_from_html, **og_from_oembed}

                await self._precache_image_url(user, media_info, og)
            else:
@ -312,7 +321,7 @@ class PreviewUrlResource(DirectServeJsonResource):

        elif oembed_url:
            # Handle the oEmbed information.
-            og, expiration_ms = await self._handle_oembed_response(
+            og, author_name, expiration_ms = await self._handle_oembed_response(
                url, media_info, expiration_ms
            )
            await self._precache_image_url(user, media_info, og)
@ -321,6 +330,11 @@ class PreviewUrlResource(DirectServeJsonResource):
            logger.warning("Failed to find any OG data in %s", url)
            og = {}

+        # If we don't have a title but we have author_name, copy it as
+        # title
+        if not og.get("og:title") and author_name:
+            og["og:title"] = author_name
+
        # filter out any stupidly long values
        keys_to_remove = []
        for k, v in og.items():
@ -484,7 +498,7 @@ class PreviewUrlResource(DirectServeJsonResource):

    async def _handle_oembed_response(
        self, url: str, media_info: MediaInfo, expiration_ms: int
-    ) -> Tuple[JsonDict, int]:
+    ) -> Tuple[JsonDict, Optional[str], int]:
        """
        Parse the downloaded oEmbed info.

@ -497,11 +511,12 @@ class PreviewUrlResource(DirectServeJsonResource):
        Returns:
            A tuple of:
                The Open Graph dictionary, if the oEmbed info can be parsed.
+                The author name if it could be retrieved from oEmbed.
                The (possibly updated) length of time, in milliseconds, the media is valid for.
        """
        # If JSON was not returned, there's nothing to do.
        if not _is_json(media_info.media_type):
-            return {}, expiration_ms
+            return {}, None, expiration_ms

        with open(media_info.filename, "rb") as file:
            body = file.read()
@ -513,7 +528,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        if open_graph_result and oembed_response.cache_age is not None:
            expiration_ms = oembed_response.cache_age

-        return open_graph_result, expiration_ms
+        return open_graph_result, oembed_response.author_name, expiration_ms

    def _start_expire_url_cache_data(self) -> Deferred:
        return run_as_background_process(
				`@ -0,0 +1 @@`
				`Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi.`