Created
March 13, 2023 02:25
-
-
Save aerovulpe/0bdb9a7e645410c9d37bc11d803de25a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * © ChatKitty, Inc. 2022 - All Rights Reserved.
 * Unauthorized copying of this file, via any medium is strictly prohibited.
 * This source code and all its derivatives are proprietary and confidential.
 */
| package com.chatkitty.domain.infrastructure.opengraph | |
| import org.htmlcleaner.HtmlCleaner | |
| import java.io.BufferedReader | |
| import java.io.InputStreamReader | |
| import java.net.URL | |
| import java.net.URLConnection | |
| import java.nio.charset.Charset | |
| import java.util.Hashtable | |
| import java.util.regex.Pattern | |
/**
 * [OpenGraph] implementation that fetches a page over the network and collects the Open Graph
 * `<meta>` elements declared in its `<head>`.
 *
 * All work happens in the constructor: it opens a connection to [url], reads the response up to
 * the closing `</head>` tag, parses that fragment with HtmlCleaner, and records every `<meta>`
 * whose `property` (or, failing that, `name`) attribute starts with a known namespace prefix.
 * No connect or read timeout is configured here, so construction blocks for as long as the
 * underlying connection does.
 *
 * @param url address of the page to inspect.
 * @param ignoreSpecErrors when `false`, construction fails unless every entry of
 *   [REQUIRED_META] was found on the page.
 * @throws java.net.MalformedURLException if [url] cannot be parsed.
 * @throws java.io.IOException if the page cannot be read.
 * @throws Exception if [ignoreSpecErrors] is `false` and a required property is missing.
 */
class OpenGraphImpl(
    url: String,
    ignoreSpecErrors: Boolean = true
) : OpenGraph {

    /** Key of [BASE_TYPES] whose list contains the page's `type`, or `null` when there is none. */
    var baseType: String?

    /** External form of the connection's URL, captured after the page has been read. */
    private val originalUrl: String

    /** Namespaces declared on the page (plus the default `og` one) and any added via [setProperty]. */
    private val pageNamespaces: MutableList<OpenGraphNamespace> = mutableListOf()

    /** Collected meta elements keyed by property name with the namespace prefix removed. */
    private val metaAttributes: MutableMap<String, MutableList<MetaElement>> = mutableMapOf()

    init {
        val connection = URL(url).openConnection().apply {
            setRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
            )
            setRequestProperty("Accept", "*/*")
        }
        val pageData = HtmlCleaner().clean(readHead(connection))

        // Namespaces come from the `prefix` attribute of <head>; a missing head or attribute
        // simply yields none.
        val prefixAttribute = pageData.findElementByName("head", true)
            ?.takeIf { it.hasAttribute("prefix") }
            ?.getAttributeByName("prefix")
        pageNamespaces.addAll(parseNamespaces(prefixAttribute))
        if (pageNamespaces.none { it.prefix == "og" }) {
            pageNamespaces.add(OpenGraphNamespace("og", DEFAULT_OG_NAMESPACE_URI))
        }

        for (metaElement in pageData.getElementsByName("meta", true)) {
            // `property` wins over `name` when both attributes are present.
            val target = when {
                metaElement.hasAttribute("property") -> "property"
                metaElement.hasAttribute("name") -> "name"
                else -> null
            } ?: continue
            val propertyName = metaElement.getAttributeByName(target) ?: continue
            val namespace = pageNamespaces
                .firstOrNull { propertyName.startsWith("${it.prefix}:") }
                ?: continue
            // A <meta> without a `content` attribute passes null here; setProperty ignores it.
            setProperty(namespace, propertyName, metaElement.getAttributeByName("content"))
        }

        if (!ignoreSpecErrors && REQUIRED_META.any { it !in metaAttributes }) {
            throw Exception("Does not conform to Open Graph protocol")
        }

        baseType = resolveBaseType(getContent("type"), pageNamespaces)
        originalUrl = connection.url.toExternalForm()
    }

    /**
     * Returns the content of the first element stored under [property], or `null` when nothing
     * is stored under that name.
     */
    override fun getContent(property: String): String? =
        metaAttributes[property]?.firstOrNull()?.content

    /**
     * Appends a meta element for [property] in [namespace].
     *
     * A leading `"<prefix>:"` is removed from [property] before it is used as the storage key,
     * and [namespace] is registered if it was not known yet. A `null` [content] is ignored:
     * nothing is stored and the namespace is not registered.
     */
    override fun setProperty(namespace: OpenGraphNamespace, property: String, content: String?) {
        if (content == null) return
        if (namespace !in pageNamespaces) pageNamespaces.add(namespace)
        // Only strip the leading prefix; later occurrences of "<prefix>:" belong to the name.
        val sanitizedProperty = property.removePrefix("${namespace.prefix}:")
        metaAttributes
            .getOrPut(sanitizedProperty) { mutableListOf() }
            .add(MetaElement(namespace, sanitizedProperty, content))
    }

    /** Returns every element stored under [property], or `null` when the property is unknown. */
    override fun getProperties(property: String): List<MetaElement>? = metaAttributes[property]

    /** Drops all elements stored under [property]; does nothing when there are none. */
    override fun removeProperty(property: String) {
        metaAttributes.remove(property)
    }

    /**
     * Renders every stored element as `<meta property="prefix:name" content="..." />`.
     *
     * NOTE(review): values are interpolated without HTML escaping, as before — confirm whether
     * callers embed this output in a page and whether escaping should happen here.
     */
    override fun toHtml(): List<String> =
        metaAttributes.values.flatMap { elements ->
            elements.map { (namespace, property, content) ->
                "<meta property=\"${namespace.prefix}:$property\" content=\"$content\" />"
            }
        }

    /**
     * Renders every stored element as `<meta name="prefix:name" content="..." />`.
     *
     * NOTE(review): values are interpolated without HTML escaping, as before — confirm whether
     * callers embed this output in a page and whether escaping should happen here.
     */
    override fun toXhtml(): List<String> =
        metaAttributes.values.flatMap { elements ->
            elements.map { (namespace, property, content) ->
                "<meta name=\"${namespace.prefix}:$property\" content=\"$content\" />"
            }
        }

    companion object {
        /** Properties that must be present when spec errors are not ignored. */
        val REQUIRED_META = arrayOf("title", "type", "image", "url")

        /** Maps each base type to the concrete `type` values that belong to it. */
        val BASE_TYPES: Hashtable<String, Array<String>> = Hashtable<String, Array<String>>().apply {
            put("activity", arrayOf("activity", "sport"))
            put("business", arrayOf("bar", "company", "cafe", "hotel", "restaurant"))
            put("group", arrayOf("cause", "sports_league", "sports_team"))
            put("organization", arrayOf("band", "government", "non_profit", "school", "university"))
            put(
                "person",
                arrayOf(
                    "actor",
                    "athlete",
                    "author",
                    "director",
                    "musician",
                    "politician",
                    "profile",
                    "public_figure"
                )
            )
            put("place", arrayOf("city", "country", "landmark", "state_province"))
            put(
                "product",
                arrayOf(
                    "album",
                    "book",
                    "drink",
                    "food",
                    "game",
                    "movie",
                    "product",
                    "song",
                    "tv_show"
                )
            )
            put("website", arrayOf("blog", "website", "article"))
        }

        /** Namespace URI assumed for the `og` prefix when the page does not declare one. */
        private const val DEFAULT_OG_NAMESPACE_URI = "http://ogp.me/ns#"

        private const val HEAD_END_TAG = "</head>"

        /**
         * Matches one `prefix: uri` pair inside the `prefix` attribute of `<head>`.
         * Group 2 is the prefix, group 3 the namespace URI (http or https).
         */
        private val NAMESPACE_PATTERN: Pattern =
            Pattern.compile("""(([A-Za-z0-9_]+):\s+(https?://ogp\.me/ns(/\w+)*#))\s*""")

        /**
         * Reads the response line by line up to and including the first `</head>` (matched
         * case-insensitively) and closes the document with an empty body so the fragment can be
         * parsed. When no closing tag is found the whole response is returned. The reader is
         * closed in every case.
         */
        private fun readHead(connection: URLConnection): String {
            val charset = getConnectionCharset(connection)
            val head = StringBuilder()
            BufferedReader(InputStreamReader(connection.getInputStream(), charset)).use { reader ->
                for (line in reader.lineSequence()) {
                    val tagStart = line.indexOf(HEAD_END_TAG, ignoreCase = true)
                    if (tagStart >= 0) {
                        head.append(line, 0, tagStart + HEAD_END_TAG.length)
                            .append("<body></body></html>")
                        break
                    }
                    // Keep a separator so tokens on adjacent lines do not run together.
                    head.append(line).append('\n')
                }
            }
            return head.toString()
        }

        /** Extracts every namespace declared in [prefixAttribute]; `null` yields an empty list. */
        private fun parseNamespaces(prefixAttribute: String?): List<OpenGraphNamespace> {
            if (prefixAttribute == null) return emptyList()
            val matcher = NAMESPACE_PATTERN.matcher(prefixAttribute)
            val declared = mutableListOf<OpenGraphNamespace>()
            while (matcher.find()) {
                declared.add(OpenGraphNamespace(matcher.group(2), matcher.group(3)))
            }
            return declared
        }

        /**
         * Finds the [BASE_TYPES] key whose list contains [declaredType] once a leading namespace
         * prefix (the first one in [namespaces] that matches) has been removed.
         */
        private fun resolveBaseType(
            declaredType: String?,
            namespaces: List<OpenGraphNamespace>
        ): String? {
            if (declaredType == null) return null
            val owner = namespaces.firstOrNull { declaredType.startsWith("${it.prefix}:") }
            val bareType = if (owner != null) declaredType.removePrefix("${owner.prefix}:") else declaredType
            return BASE_TYPES.entries.firstOrNull { bareType in it.value }?.key
        }

        /**
         * Picks the charset named in the connection's content type, falling back to the platform
         * default when none is given or the name cannot be resolved.
         */
        private fun getConnectionCharset(connection: URLConnection): Charset =
            try {
                connection.contentType
                    ?.lowercase()
                    ?.let(::extractCharsetName)
                    ?.takeIf { it.isNotEmpty() }
                    ?.let { Charset.forName(it) }
                    ?: Charset.defaultCharset()
            } catch (e: Exception) {
                // Best effort: an illegal or unsupported charset name must not abort the fetch.
                Charset.defaultCharset()
            }

        /**
         * Returns the value of the `charset=` parameter in [contentType] with surrounding
         * whitespace and quotes removed, or `null` when there is no such parameter. Only the
         * text before the first `:` is inspected.
         */
        private fun extractCharsetName(contentType: String): String? =
            contentType.substringBefore(':')
                .split(';')
                .map { param -> param.trim { it <= ' ' } }
                .firstOrNull { it.startsWith("charset=") }
                ?.removePrefix("charset=")
                ?.trim { it <= ' ' }
                ?.trim('"', '\'')
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment