@ -386,7 +386,7 @@ class PreviewUrlResource(DirectServeJsonResource):
"""
Check whether the URL should be downloaded as oEmbed content instead .
Param s:
Arg s:
url : The URL to check .
Returns :
@ -403,7 +403,7 @@ class PreviewUrlResource(DirectServeJsonResource):
"""
Request content from an oEmbed endpoint .
Param s:
Arg s:
endpoint : The oEmbed API endpoint .
url : The URL to pass to the API .
@ -692,27 +692,51 @@ class PreviewUrlResource(DirectServeJsonResource):
def decode_and_calc_og (
body : bytes , media_uri : str , request_encoding : Optional [ str ] = None
) - > Dict [ str , Optional [ str ] ] :
"""
Calculate metadata for an HTML document .
This uses lxml to parse the HTML document into the OG response . If errors
occur during processing of the document , an empty response is returned .
Args :
body : The HTML document , as bytes .
media_url : The URI used to download the body .
request_encoding : The character encoding of the body , as a string .
Returns :
The OG response as a dictionary .
"""
# If there's no body, nothing useful is going to be found.
if not body :
return { }
from lxml import etree
# Create an HTML parser. If this fails, log and return no metadata.
try :
parser = etree . HTMLParser ( recover = True , encoding = request_encoding )
tree = etree . fromstring ( body , parser )
og = _calc_og ( tree , media_uri )
except LookupError :
# blindly consider the encoding as utf-8.
parser = etree . HTMLParser ( recover = True , encoding = " utf-8 " )
except Exception as e :
logger . warning ( " Unable to create HTML parser: %s " % ( e , ) )
return { }
def _attempt_calc_og ( body_attempt : Union [ bytes , str ] ) - > Dict [ str , Optional [ str ] ] :
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree . fromstring ( body_attempt , parser )
return _calc_og ( tree , media_uri )
# Attempt to parse the body. If this fails, log and return no metadata.
try :
return _attempt_calc_og ( body )
except UnicodeDecodeError :
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
parser = etree . HTMLParser ( recover = True , encoding = request_encoding )
tree = etree . fromstring ( body . decode ( " utf-8 " , " ignore " ) , parser )
og = _calc_og ( tree , media_uri )
return og
return _attempt_calc_og ( body . decode ( " utf-8 " , " ignore " ) )
def _calc_og ( tree , media_uri : str ) - > Dict [ str , Optional [ str ] ] :
def _calc_og ( tree : " etree.Element " , media_uri : str ) - > Dict [ str , Optional [ str ] ] :
# suck our tree into lxml and define our OG response.
# if we see any image URLs in the OG response, then spider them