Skip to content

Instantly share code, notes, and snippets.

@kristiyanto
kristiyanto / matcher.py
Created July 3, 2024 15:25
NLP: NER using rule-based matcher - Medium Article
# Refer to the Jupyter Notebook and article for package imports and the complete code.
# NOTE(review): indentation was lost in page extraction and the function body is
# truncated here (the pattern list is never closed) — restore 4-space indents and
# the remainder of the body from the original gist before running.
def extract_keywords(text, max_keywords=10):
# Extract candidate keywords from free text using spaCy's rule-based Matcher.
# `nlp` is presumably a spaCy pipeline loaded elsewhere — TODO confirm.
# `max_keywords` is not used in the visible portion; likely caps the result later.
doc = nlp(text)
matcher = Matcher(nlp.vocab)
# Noun and Noun Phrases
noun_phrases_patterns = [
[{'POS': 'NUM'}, {'POS': 'NOUN'}], #example: 2 bedrooms
[{'POS': 'ADJ', 'OP': '*'}, {'POS': 'NOUN'}], #example: beautiful house
@kristiyanto
kristiyanto / t5.py
Created July 3, 2024 14:41
NLP: LLM for text summarization - Medium Article
# Refer to the Jupyter Notebook and article for package imports and the complete code.
# Load the pretrained "t5-small" checkpoint once at module level
# (HuggingFace Transformers T5Tokenizer / T5ForConditionalGeneration).
model_name = "t5-small"
# legacy=False opts into the updated (non-legacy) T5 tokenizer behavior.
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# NOTE(review): indentation was lost in page extraction and the body is truncated
# after the short-circuit guard — see the original gist for the generation step.
def summarize_with_t5(text, max_length=80):
# Texts already shorter than the target length are returned unchanged.
if len(text) < max_length:
return text
@kristiyanto
kristiyanto / sent_score.py
Created July 3, 2024 14:34
NLP: Text Summarization for Medium Article
# Refer to the Jupyter Notebook and article for package imports and the complete code.
# NOTE(review): indentation was lost in page extraction and the function is
# truncated after the empty-input guard — see the original gist for the
# sentence-scoring logic.
def summarize(text, char_limit=80):
# `text` appears to be an object exposing .description and .keywords
# attributes (not a plain string) — TODO confirm against the notebook.
doc = nlp(text.description)
sentences = [sent.text.strip() for sent in doc.sents]
keywords = text.keywords
# Nothing to score against: no keywords or no sentences -> empty summary.
if not keywords or not sentences:
return ""
@kristiyanto
kristiyanto / custom_lemma.py
Created July 3, 2024 14:30
NLP: Custom Lemma for Medium Article
# Refer to the Jupyter Notebook and article for package imports and the complete code.
# Base English pipeline that the custom component is registered against.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): indentation was lost in page extraction and the component body is
# truncated mid-dict — see the original gist for the closing brace and the
# lemma-assignment loop / return of `doc`.
@Language.component("custom_lemma_component")
def custom_lemma_component(doc):
# Domain abbreviations mapped to full-word lemmas (real-estate listing text).
custom_lemmas = {
"br": "bedroom",
"apt": "apartment",
"st": "street",
"min": "minute",
@kristiyanto
kristiyanto / logger.ts
Created June 23, 2024 09:10
logging_with_winston_medium_2024
/**
* Transmit logs to Google Cloud Logging
*
* Setup Instructions:
* 1. Generate and Install Google Service Account Credentials:
* - Ensure the Service Account has Log Writer access.
* - Follow the instructions here: https://developers.google.com/workspace/guides/create-credentials
*
* 2. Configure env variables:
* When enabled, Logs will be transmitted to both the console and Google Cloud Logging.
# Stratified 50/50 A/B split over recently-active users.
# "Recent" = any transaction within 60 days of the newest created_date.
heartbeat = transaction.select(f.max('created_date')).collect()[0][0] - pd.Timedelta('60 days')

# One row per active user; stratum = "birth_year|device" so sampling stays
# balanced across demographic/device combinations.
all_users = (
    transaction
    .filter(f.col('created_date') > f.lit(heartbeat))
    .select(
        'user_id',
        f.concat_ws('|', 'birth_year', 'device').alias('stratum'),
        'home_city',
    )
    .distinct()
)

# Per-stratum sampling fraction of 0.5 for every stratum.
# FIX: the original used a bare `lit(0.5)`, which raises NameError unless
# `lit` was imported separately — use the module-qualified `f.lit` like the
# rest of this snippet.
fractions = (
    all_users.select('stratum').distinct()
    .withColumn('frac', f.lit(0.5))
    .rdd.collectAsMap()
)

# Group A: 50% sample from each stratum, seed fixed for reproducibility.
group_A = all_users.sampleBy('stratum', fractions, 555)
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer
# Collaborative-filtering model: implicit-feedback ALS over
# (user_index, context_index, count) triples. coldStartStrategy='drop'
# removes rows for unseen users/items at prediction time so RMSE stays finite.
als = ALS(
    userCol='user_index',
    itemCol='context_index',
    ratingCol='count',
    maxIter=5,
    regParam=0.01,
    implicitPrefs=True,
    nonnegative=True,
    coldStartStrategy='drop',
    seed=777,
)

# Evaluator: RMSE of predicted scores against observed counts.
rmse = RegressionEvaluator(
    metricName='rmse',
    labelCol='count',
    predictionCol='prediction',
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier
import warnings
from fintech.features import activity, demographic, location, recency
from pyspark.ml import Pipeline
# NOTE(review): indentation was lost in page extraction and the feature list is
# truncated here — see the full gist/notebook for the remaining features and the
# return value.
def generateFeatures(transaction, users):
# Assemble the per-user feature transformers; each activity.f_* factory builds
# a transformer from the transaction table (defined in fintech.features).
# `users` is not used in the visible portion — presumably consumed further down.
feature_list = [
# HISTORICAL ACTIVITIES
activity.f_user_transcAmount_min(transc_table=transaction),
activity.f_user_transcAmount_max(transc_table=transaction),
activity.f_user_transcAmount_total(transc_table=transaction),
@kristiyanto
kristiyanto / fintech_engagement_feature_module.py
Created May 28, 2021 07:50
Fintech Engagement: feature computation
class f_user_daysSinceLastTransc(Transformer):
''' Feature description: Number of days since last transaction.
'''
@keyword_only
def __init__(self, transc_table, **kwargs):
super(f_user_daysSinceLastTransc, self).__init__()
self.transc_table = transc_table
self.feature_names = "user_daysSinceLastTransc"