Created
March 7, 2025 18:12
-
-
Save gitronald/66cac42194ea2d489ff3a1e32651e736 to your computer and use it in GitHub Desktop.
WebSearcher: update converter for Canonical Name to UULE parameter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# WebSearcher Locations\n", | |
| "\n", | |
| "Update: 2025-03-07\n", | |
| "- Correction to method for encoding a UULE\n", | |
| "- Realized there was an issue with localization for certain canonical names, namely those for new US congressional districts\n", | |
| " - e.g., `\"Alabama's 1st Congressional District,Alabama,United States\"` worked, but `\"Alabama's 1st Congressional District 2024 redistricting,Alabama,United States\"` did not.\n", | |
| "- The previously used method, which others have posted about, used a fixed prefix of \"w+CAIQICI\" followed by an alphanumeric key (sliced based on the length of the canonical name) and a base64 encoded canonical name.\n", | |
| "- The length of the new congressional district names was a clue, and while looking into the prefix I found [this guide](https://valentin.app/uule.html) which explains the prefix is actually a protocol buffer that has three fields.\n", | |
| "- Although that guide points this out, it doesn't provide the code for encoding/decoding but hints at a protocol buffer used by Google, and it turns out Google has a `protobuf` library for Python.\n", | |
| "- Below are the new location encoding/decoding functions and some tests\n", | |
| "- While testing, it also became clear that all congressional districts appear to localize to the state level, e.g., they show the location notice “Results for California” and the map is always the same regardless of whether the canonical name is for California's 1st (Northeast corner) or 52nd district (Southern edge).\n", | |
| " - This means that the location results for those geolocations may be coarse-grained to the state level, though the ads for those searches may still align with ads targeted to that specific district.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "original:\t CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n", | |
| "decoded fields:\t {1: 2, 2: 32, 4: 'West New York,New Jersey,United States'}\n", | |
| "reencoded:\t CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n", | |
| "uule:\t\t w+CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n", | |
| "https://google.com/search?q=pizza&uule=w+CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM=\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import base64\n", | |
| "from google.protobuf.internal import decoder, encoder # poetry add protobuf\n", | |
| "from typing import Dict, Union, Any\n", | |
| "\n", | |
| "def convert_canonical_name_to_uule(canon_name: str) -> str:\n", | |
| " \"\"\"\n", | |
| " Get UULE parameter based on a location's canonical name.\n", | |
| " Args: canon_name: Canonical name of the location\n", | |
| " Returns: UULE parameter for Google search\n", | |
| " \"\"\"\n", | |
| " fields = {1: 2, 2: 32, 4: canon_name}\n", | |
| " encoded_string = encode_protobuf_string(fields)\n", | |
| " return f'w+{encoded_string}'\n", | |
| "\n", | |
| "\n", | |
| "def encode_protobuf_string(fields: Dict[int, Union[str, int]]) -> str:\n", | |
| " \"\"\"\n", | |
| " Encode a dictionary of field numbers and values into a base64-encoded protobuf string.\n", | |
| " Args: fields: A dictionary where keys are protobuf field numbers and values are the data to encode\n", | |
| " Returns: A base64-encoded protobuf message string\n", | |
| " \"\"\"\n", | |
| " encoded = bytearray() # Buffer to store encoded bytes\n", | |
| "\n", | |
| " for field_number, value in fields.items():\n", | |
| " wire_type = 2 if isinstance(value, str) else 0 # Determine wire type based on value type\n", | |
| " tag = field_number << 3 | wire_type # Combine field number and wire type into tag\n", | |
| " encoded.extend(encoder._VarintBytes(tag)) # Encode the tag into bytes\n", | |
| " \n", | |
| " # Encode the value based on wire type\n", | |
| " if wire_type == 0:\n", | |
| " encoded.extend(encoder._VarintBytes(value)) # Encode the integer as varint\n", | |
| " if wire_type == 2:\n", | |
| " value = value.encode('utf-8') # Convert string to bytes\n", | |
| " encoded.extend(encoder._VarintBytes(len(value))) # Add length prefix\n", | |
| " encoded.extend(value) # Add the actual bytes\n", | |
| " \n", | |
| " return base64.b64encode(bytes(encoded)).decode('utf-8') # Convert to base64 and decode to string\n", | |
| "\n", | |
| "\n", | |
| "def decode_protobuf_string(encoded_string: str) -> Dict[int, Any]:\n", | |
| " \"\"\"\n", | |
| " Decode a base64-encoded protobuf string into a dictionary of field numbers and values.\n", | |
| " Args: encoded_string: A base64-encoded protobuf message\n", | |
| " Returns: dictionary where keys are protobuf field numbers and values are the decoded values\n", | |
| " \"\"\"\n", | |
| "\n", | |
| " pos = 0 # Position tracker for decoding\n", | |
| " fields = {} # Dictionary to store decoded field numbers and values\n", | |
| "\n", | |
| " protobuf_bytes = base64.b64decode(encoded_string) # Convert to protobuf bytes\n", | |
| " while pos < len(protobuf_bytes):\n", | |
| "\n", | |
| " # Get field number and wire type\n", | |
| " tag, pos_new = decoder._DecodeVarint(protobuf_bytes, pos) # Each protobuf field starts with a varint tag\n", | |
| " field_number, wire_type = tag >> 3, tag & 7 # Extract field number and wire type from tag\n", | |
| " \n", | |
| " # Decode value based on wire type (0: varint, 2: length-delimited; others not supported)\n", | |
| " if wire_type == 0:\n", | |
| " value, pos_new = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get the varint value and new position\n", | |
| " elif wire_type == 2:\n", | |
| " length, pos_start = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get length and starting position\n", | |
| " value = protobuf_bytes[pos_start:pos_start + length] # Extract data based on the length\n", | |
| " pos_new = pos_start + length # Update the new position\n", | |
| " value = value.decode('utf-8') # Assume UTF-8 encoding for strings\n", | |
| " \n", | |
| " fields[field_number] = value # Store the field number and value in the dictionary\n", | |
| " pos = pos_new # Move to the next field using the updated position\n", | |
| " return fields\n", | |
| "\n", | |
| "# Decode\n", | |
| "canon_name = \"West New York,New Jersey,United States\"\n", | |
| "encoded_string = 'CAIQICImV2VzdCBOZXcgWW9yayxOZXcgSmVyc2V5LFVuaXRlZCBTdGF0ZXM='\n", | |
| "fields = decode_protobuf_string(encoded_string)\n", | |
| "print('original:\\t', encoded_string)\n", | |
| "print('decoded fields:\\t', fields)\n", | |
| "assert fields[1] == 2, \"field 1 should be 2\"\n", | |
| "assert fields[2] == 32, \"field 2 should be 32\"\n", | |
| "assert fields[4] == canon_name, f\"field 4 should be {canon_name}\"\n", | |
| "\n", | |
| "# Reencode\n", | |
| "reencoded_string = encode_protobuf_string(fields)\n", | |
| "print('reencoded:\\t', reencoded_string)\n", | |
| "assert reencoded_string == encoded_string, \"reencoded string should match the original\"\n", | |
| "\n", | |
| "# UULE\n", | |
| "uule = f'w+{reencoded_string}'\n", | |
| "print('uule:\\t\\t', uule)\n", | |
| "assert(uule == convert_canonical_name_to_uule(canon_name))\n", | |
| "\n", | |
| "print(f\"https://google.com/search?q=pizza&uule={uule}\")\n" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "searchaudits-KG7gmxJq-py3.12", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment