PATCH: Add uri percent-encoding for binary data

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
5 messages Options
Reply | Threaded
Open this post in threaded view
|

PATCH: Add uri percent-encoding for binary data

Anders Åstrand-2
Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

The background for this patch is that I could easily build urls in
plpgsql, but doing the actual encoding of the url parts is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason this one couldn't be in
there.

In modern web scenarios one would probably most likely want to encode
the utf8 representation of a text string for inclusion in a url, in
which case correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.

Regards,
Anders

uri-encoding-v1.patch (5K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: PATCH: Add uri percent-encoding for binary data

Bruce Momjian
On Mon, Oct  7, 2019 at 09:14:38AM +0200, Anders Åstrand wrote:
> Hello
>
> Attached is a patch for adding uri as an encoding option for
> encode/decode. It uses what's called "percent-encoding" in rfc3986
> (https://tools.ietf.org/html/rfc3986#section-2.1).

Oh, that's a cool idea.  Can you add it to the commit-fest?

        https://commitfest.postgresql.org/25/

---------------------------------------------------------------------------


>
> The background for this patch is that I could easily build urls in
> plpgsql, but doing the actual encoding of the url parts is painfully
> slow. The list of available encodings for encode/decode looks quite
> arbitrary to me, so I can't see any reason this one couldn't be in
> there.
>
> In modern web scenarios one would probably most likely want to encode
> the utf8 representation of a text string for inclusion in a url, in
> which case correct invocation would be ENCODE(CONVERT_TO('some text in
> database encoding goes here', 'UTF8'), 'uri'), but uri
> percent-encoding can of course also be used for other text encodings
> and arbitrary binary data.
>
> Regards,
> Anders

> diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
> index 7293d66de5..33cf7bb57c 100644
> --- a/src/backend/utils/adt/encode.c
> +++ b/src/backend/utils/adt/encode.c
> @@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
>   return len;
>  }
>  
> +/*
> + * URI percent encoding
> + *
> + * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
> + */
> +
> +static const char upper_hex_digits[] = "0123456789ABCDEF";
> +
> +static unsigned
> +uri_encode(const char *src, unsigned srclen, char *dst)
> +{
> + char *d = dst;
> +
> + for (const char *s = src; s < src + srclen; s++)
> + {
> + if ((*s >= 'A' && *s <= 'Z') ||
> + (*s >= 'a' && *s <= 'z') ||
> + (*s >= '0' && *s <= '9') ||
> + *s == '-' ||
> + *s == '_' ||
> + *s == '.' ||
> + *s == '~')
> + {
> + *d++ = *s;
> + }
> + else
> + {
> + *d++ = '%';
> + *d++ = upper_hex_digits[(*s >> 4) & 0xF];
> + *d++ = upper_hex_digits[*s & 0xF];
> + }
> + }
> + return d - dst;
> +}
> +
> +static unsigned
> +uri_decode(const char *src, unsigned srclen, char *dst)
> +{
> + const char *s = src;
> + const char *srcend = src + srclen;
> + char *d = dst;
> + char val;
> +
> + while (s < srcend)
> + {
> + if (*s == '%')
> + {
> + if (s > srcend - 3) {
> + /* This will never get triggered since uri_dec_len already takes care of validation
> + */
> + ereport(ERROR,
> + (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> + errmsg("invalid uri percent encoding"),
> + errhint("Input data ends prematurely.")));
> + }
> +
> + /* Skip '%' */
> + s++;
> +
> + val = get_hex(*s++) << 4;
> + val += get_hex(*s++);
> + *d++ = val;
> + }
> + else
> + {
> + *d++ = *s++;
> + }
> + }
> + return d - dst;
> +}
> +
> +static unsigned
> +uri_enc_len(const char *src, unsigned srclen)
> +{
> + int len = 0;
> +
> + for (const char *s = src; s < src + srclen; s++)
> + {
> + if ((*s >= 'A' && *s <= 'Z') ||
> + (*s >= 'a' && *s <= 'z') ||
> + (*s >= '0' && *s <= '9') ||
> + *s == '-' ||
> + *s == '_' ||
> + *s == '.' ||
> + *s == '~')
> + {
> + len++;
> + }
> + else
> + {
> + len += 3;
> + }
> + }
> + return len;
> +}
> +
> +static unsigned
> +uri_dec_len(const char *src, unsigned srclen)
> +{
> + const char *s = src;
> + const char *srcend = src + srclen;
> + int len = 0;
> +
> + while (s < srcend)
> + {
> + if (*s == '%')
> + {
> + if (s > srcend - 3) {
> + ereport(ERROR,
> + (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> + errmsg("invalid uri percent encoding"),
> + errhint("Input data ends prematurely.")));
> + }
> + s++;
> + get_hex(*s++);
> + get_hex(*s++);
> + }
> + else {
> + s++;
> + }
> + len++;
> + }
> + return len;
> +}
> +
>  /*
>   * Common
>   */
> @@ -541,6 +666,12 @@ static const struct
>   esc_enc_len, esc_dec_len, esc_encode, esc_decode
>   }
>   },
> + {
> + "uri",
> + {
> + uri_enc_len, uri_dec_len, uri_encode, uri_decode
> + }
> + },
>   {
>   NULL,
>   {
> diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
> index 2483966576..f89c5ec1c3 100644
> --- a/src/test/regress/expected/strings.out
> +++ b/src/test/regress/expected/strings.out
> @@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
>   Th\000o\x02\x03
>  (1 row)
>  
> +SET bytea_output TO hex;
> +SELECT encode(E'en\\300\\336d'::bytea, 'uri');
> +  encode  
> +-----------
> + en%C0%DEd
> +(1 row)
> +
> +SELECT decode('%De%c0%DEd', 'uri');
> +   decode  
> +------------
> + \xdec0de64
> +(1 row)
> +
> +SELECT decode('error%Ex', 'uri');
> +ERROR:  invalid hexadecimal digit: "x"
> +SELECT decode('error%E', 'uri');
> +ERROR:  invalid uri percent encoding
> +HINT:  Input data ends prematurely.
> +SELECT decode('error%', 'uri');
> +ERROR:  invalid uri percent encoding
> +HINT:  Input data ends prematurely.
> diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
> index b5e75c344f..1d03836b6e 100644
> --- a/src/test/regress/sql/strings.sql
> +++ b/src/test/regress/sql/strings.sql
> @@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
>  SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
>  SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
>  SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
> +
> +SET bytea_output TO hex;
> +SELECT encode(E'en\\300\\336d'::bytea, 'uri');
> +SELECT decode('%De%c0%DEd', 'uri');
> +SELECT decode('error%Ex', 'uri');
> +SELECT decode('error%E', 'uri');
> +SELECT decode('error%', 'uri');


--
  Bruce Momjian  <[hidden email]>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

+ As you are, so once was I.  As I am, so you will be. +
+                      Ancient Roman grave inscription +


Reply | Threaded
Open this post in threaded view
|

Re: PATCH: Add uri percent-encoding for binary data

Isaac Morland
In reply to this post by Anders Åstrand-2
On Mon, 7 Oct 2019 at 03:15, Anders Åstrand <[hidden email]> wrote:
Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

The background for this patch is that I could easily build urls in
plpgsql, but doing the actual encoding of the url parts is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason this one couldn't be in
there.

In modern web scenarios one would probably most likely want to encode
the utf8 representation of a text string for inclusion in a url, in
which case correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.

This seems like a useful idea to me. I've used the equivalent in Python and it provides more options:

https://docs.python.org/3/library/urllib.parse.html#url-quoting

I suggest reviewing that documentation there, because there are a few details that need to be checked carefully. Whether or not space should be encoded as plus and whether certain byte values should be exempt from %-encoding is something that depends on the application. Unfortunately, as far as I can tell there isn't a single version of URL encoding that satisfies all situations (thus explaining the complexity of the Python implementation). It might be feasible to suppress some of the Python options (I'm wondering about the safe= parameter) but I'm pretty sure you at least need the equivalent of quote and quote_plus.
Reply | Threaded
Open this post in threaded view
|

Re: PATCH: Add uri percent-encoding for binary data

Anders Åstrand-2
In reply to this post by Bruce Momjian
On Mon, Oct 7, 2019 at 9:52 PM Bruce Momjian <[hidden email]> wrote:

>
> On Mon, Oct  7, 2019 at 09:14:38AM +0200, Anders Åstrand wrote:
> > Hello
> >
> > Attached is a patch for adding uri as an encoding option for
> > encode/decode. It uses what's called "percent-encoding" in rfc3986
> > (https://tools.ietf.org/html/rfc3986#section-2.1).
>
> Oh, that's a cool idea.  Can you add it to the commit-fest?
>
>         https://commitfest.postgresql.org/25/
>
>

Thanks for your reply! I added it but was unsure of what topic was
appropriate and couldn't find a description of them anywhere. I went
with Miscellaneous for now.


Reply | Threaded
Open this post in threaded view
|

Re: PATCH: Add uri percent-encoding for binary data

Anders Åstrand-2
In reply to this post by Isaac Morland
On Mon, Oct 7, 2019 at 11:38 PM Isaac Morland <[hidden email]> wrote:

>
> On Mon, 7 Oct 2019 at 03:15, Anders Åstrand <[hidden email]> wrote:
>>
>> Hello
>>
>> Attached is a patch for adding uri as an encoding option for
>> encode/decode. It uses what's called "percent-encoding" in rfc3986
>> (https://tools.ietf.org/html/rfc3986#section-2.1).
>>
>> The background for this patch is that I could easily build urls in
>> plpgsql, but doing the actual encoding of the url parts is painfully
>> slow. The list of available encodings for encode/decode looks quite
>> arbitrary to me, so I can't see any reason this one couldn't be in
>> there.
>>
>> In modern web scenarios one would probably most likely want to encode
>> the utf8 representation of a text string for inclusion in a url, in
>> which case correct invocation would be ENCODE(CONVERT_TO('some text in
>> database encoding goes here', 'UTF8'), 'uri'), but uri
>> percent-encoding can of course also be used for other text encodings
>> and arbitrary binary data.
>
>
> This seems like a useful idea to me. I've used the equivalent in Python and it provides more options:
>
> https://docs.python.org/3/library/urllib.parse.html#url-quoting
>
> I suggest reviewing that documentation there, because there are a few details that need to be checked carefully. Whether or not space should be encoded as plus and whether certain byte values should be exempt from %-encoding is something that depends on the application. Unfortunately, as far as I can tell there isn't a single version of URL encoding that satisfies all situations (thus explaining the complexity of the Python implementation). It might be feasible to suppress some of the Python options (I'm wondering about the safe= parameter) but I'm pretty sure you at least need the equivalent of quote and quote_plus.

Thanks a lot for your reply!

I agree that some (but not all) of the options available in that
Python lib could be helpful for developers wanting to build urls
without having to encode the separate parts and stitch them
together, but they are not necessary for this patch to be useful. For generic
uri encoding the slash (/) must be percent encoded, because it has
special meaning in the standard. Some other extra characters may
appear unencoded though depending on context, but it's generally safer
to just encode them all and not hope that the encoder will know about
the context and skip over certain characters.

This does bring up an interesting point however. Maybe decode should
validate that only characters that are allowed unencoded appear in the
input?

Luckily, the plus-encoding of spaces is not part of the uri standard
at all but is instead part of the format referred to as
application/x-www-form-urlencoded data. Fortunately that format is
close to dying now that forms more often post json.

Regards,
Anders