@gitronald
Created March 7, 2025 18:12
WebSearcher: update converter for Canonical Name to UULE parameter
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebSearcher Locations\n",
"\n",
"Update: 2025-03-07\n",
"- Correction to method for encoding a UULE\n",
"- Realized there was an issue with localization for certain canonical names, namely those for new US congressional districts\n",
" - e.g., `\"Alabama's 1st Congressional District,Alabama,United States\"` worked, but `\"Alabama's 1st Congressional District 2024 redistricting,Alabama,United States\"` did not.\n",
"- The previously used method, which others have posted about, used a fixed prefix of \"w+CAIQICI\" followed by an alphanumeric key (sliced based on the length of the canonical name) and a base64 encoded canonical name.\n",
"- The length of the new congressional district names was a clue, and while looking into the prefix I found [this guide](https://valentin.app/uule.html) which explains the prefix is actually a protocol buffer that has three fields.\n",
"- Although that guide points this out, it doesn't provide the code for encoding/decoding but hints at a protocol buffer used by Google, and it turns out Google has `protobuf` library for Python.\n",
"- Below are the new location encoding/decoding functions and some tests\n",
"- While testing, it also became clear that, for congressional districts, all appear to localize to the state level. e.g., they have a location notice “Results for California” and the map is always the same regardless if using the canonical name for California's 1st (Northeast corner) or 52nd district (Southern edge).\n",
" - This means that the location results for those geolocations may not be coarse-grained to the state-level, though the ads for those searches may still align with ads targeted to that specific district.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"original:\t CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n",
"decoded fields:\t {1: 2, 2: 32, 4: 'West New York,New Jersey,United States'}\n",
"reencoded:\t CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n",
"uule:\t\t w+CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n",
"https://google.com/search?q=pizza&uule=w+CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n"
]
}
],
"source": [
"import base64\n",
"from google.protobuf.internal import decoder, encoder # poetry add protobuf\n",
"from typing import Dict, Union, Any\n",
"\n",
"def convert_canonical_name_to_uule(canon_name: str) -> str:\n",
" \"\"\"\n",
" Get UULE parameter based on a location's canonical name.\n",
" Args: canon_name: Canonical name of the location\n",
" Returns: UULE parameter for Google search\n",
" \"\"\"\n",
" fields = {1: 2, 2: 32, 4: canon_name}\n",
" encoded_string = encode_protobuf_string(fields)\n",
" return f'w+{encoded_string}'\n",
"\n",
"\n",
"def encode_protobuf_string(fields: Dict[int, Union[str, int]]) -> str:\n",
" \"\"\"\n",
" Encode a dictionary of field numbers and values into a base64-encoded protobuf string.\n",
" Args: fields: A dictionary where keys are protobuf field numbers and values are the data to encode\n",
" Returns: A base64-encoded protobuf message string\n",
" \"\"\"\n",
" encoded = bytearray() # Buffer to store encoded bytes\n",
"\n",
" for field_number, value in fields.items():\n",
" wire_type = 2 if isinstance(value, str) else 0 # Determine wire type based on value type\n",
" tag = field_number << 3 | wire_type # Combine field number and wire type into tag\n",
" encoded.extend(encoder._VarintBytes(tag)) # Encode the tag into bytes\n",
" \n",
" # Encode the value based on wire type\n",
" if wire_type == 0:\n",
" encoded.extend(encoder._VarintBytes(value)) # Encode the integer as varint\n",
" if wire_type == 2:\n",
" value = value.encode('utf-8') # Convert string to bytes\n",
" encoded.extend(encoder._VarintBytes(len(value))) # Add length prefix\n",
" encoded.extend(value) # Add the actual bytes\n",
" \n",
" return base64.b64encode(bytes(encoded)).decode('utf-8') # Convert to base64 and decode to string\n",
"\n",
"\n",
"def decode_protobuf_string(encoded_string: str) -> Dict[int, Any]:\n",
" \"\"\"\n",
" Decode a base64-encoded protobuf string into a dictionary of field numbers and values.\n",
" Args: encoded_string: A base64-encoded protobuf message\n",
" Returns: dictionary where keys are protobuf field numbers and values are the decoded values\n",
" \"\"\"\n",
"\n",
" pos = 0 # Position tracker for decoding\n",
" fields = {} # Dictionary to store decoded field numbers and values\n",
"\n",
" protobuf_bytes = base64.b64decode(encoded_string) # Convert to protobuf bytes\n",
" while pos < len(protobuf_bytes):\n",
"\n",
" # Get field number and wire type\n",
" tag, pos_new = decoder._DecodeVarint(protobuf_bytes, pos) # Each protobuf field starts with a varint tag\n",
" field_number, wire_type = tag >> 3, tag & 7 # Extract field number and wire type from tag\n",
" \n",
" # Decode value based on wire type (0: varint, 2: length-delimited; others not supported)\n",
" if wire_type == 0:\n",
" value, pos_new = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get the varint value and new position\n",
" elif wire_type == 2:\n",
" length, pos_start = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get length and starting position\n",
" value = protobuf_bytes[pos_start:pos_start + length] # Extract data based on the length\n",
" pos_new = pos_start + length # Update the new position\n",
" value = value.decode('utf-8') # Assume UTF-8 encoding for strings\n",
" \n",
" fields[field_number] = value # Store the field number and value in the dictionary\n",
" pos = pos_new # Move to the next field using the updated position\n",
" return fields\n",
"\n",
"# Decode\n",
"canon_name = \"West New York,New Jersey,United States\"\n",
"encoded_string = 'CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM='\n",
"fields = decode_protobuf_string(encoded_string)\n",
"print('original:\\t', encoded_string)\n",
"print('decoded fields:\\t', fields)\n",
"assert fields[1] == 2, \"field 1 should be 2\"\n",
"assert fields[2] == 32, \"field 2 should be 32\"\n",
"assert fields[4] == canon_name, f\"field 4 should be {canon_name}\"\n",
"\n",
"# Reencode\n",
"reencoded_string = encode_protobuf_string(fields)\n",
"print('reencoded:\\t', reencoded_string)\n",
"assert reencoded_string == encoded_string, \"reencoded string should match the original\"\n",
"\n",
"# UULE\n",
"uule = f'w+{reencoded_string}'\n",
"print('uule:\\t\\t', uule)\n",
"assert(uule == convert_canon_name_to_uule(canon_name))\n",
"\n",
"print(f\"https://google.com/search?q=pizza&uule={uule}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "searchaudits-KG7gmxJq-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
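For reference, the legacy fixed-prefix method described in the notes above can be reconstructed from the protobuf framing: the prefix `w+CAIQICI` is the base64 of the header bytes for fields 1 and 2 plus the tag of field 4, and the "key" character is simply the base64 digit encoding the canonical name's byte length. The sketch below rests on that reading; `legacy_uule` is a hypothetical name, not the original code. It also shows why the long 2024 redistricting names failed: a single base64 digit only covers lengths 0 to 63.

```python
import base64

# The base64 digit alphabet; the "key" character in the legacy method is just
# the digit at the index equal to the canonical name's byte length.
B64_DIGITS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

def legacy_uule(canon_name: str) -> str:
    """Reconstruction (hypothetical) of the legacy fixed-prefix UULE encoding."""
    name_bytes = canon_name.encode("utf-8")
    if len(name_bytes) > 63:
        # One base64 digit can only represent 0-63, so longer canonical names
        # (like the 2024 redistricting districts) cannot be encoded this way.
        raise ValueError("legacy method cannot encode names longer than 63 bytes")
    key = B64_DIGITS[len(name_bytes)]  # length byte rendered as a base64 digit
    # The 6-byte protobuf header is base64-aligned, so the name's base64 can
    # simply be concatenated after the fixed prefix and key character.
    return "w+CAIQICI" + key + base64.b64encode(name_bytes).decode("ascii")

print(legacy_uule("West New York,New Jersey,United States"))
# -> w+CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=
```

For short names this reproduces the same UULE as the protobuf-based encoder in the notebook, which is why the fixed-prefix trick worked for years before the longer district names exposed it.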