kinoc · June 21, 2021 10:54 · Jun 21, 2021
diff --git a/jserv_hf_fast.py b/jserv_hf_fast.py
@@ -0,0 +1,570 @@
+
+# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
+# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
+# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
+# Uses GDOWN to get the image
+# You will need 26 GB of space, 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion)
+
+# Near Simplest Language model API, with room to expand!
+# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI
+# change "seq" (which is the context size) to adjust footprint
+#
+# JAX-based
+# seq   vram usage
+# 512   14.7G
+# 900   15.3G
+#
+# HF-based
+# seq   vram usage
+# 512   15.6 G
+# 900   --.- G
+#
+
+# uses FastAPI, so install that
+# https://fastapi.tiangolo.com/tutorial/
+#   pip install fastapi
+#   pip install uvicorn[standard]
+#   pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3
+#   pip install termcolor
+#   #`pip install flask-ngrok
+#   #`pip install flask_cloudflared
+#   pip install pyngrok
+#   pip install nest-asyncio
+
+#   pip install gdown
+#   gdown --id 1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1 --output ../j6b_ckpt.tar
+#   (results: 12.6GB [18:19, 11.4MB/s])
+#
+# note: for my setup I needed to perform the symlink suggested by myjr52 in https://github.com/google/jax/issues/5231
+# https://pytorch.org/get-started/previous-versions/
+# for cuda 10.1
+# pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
+# for cuda 11.2
+# pip install torch==1.8.1+cu112 torchvision==0.9.1+cu112 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
+
+# conda install python-multipart 
+
+#--------------------------------------
+#check pyngrok — https://github.com/alexdlaird/pyngrok
+#install 
+#   pip install pyngrok
+#
+#    Set up your ngrok Authtoken
+# ngrok authtoken xxxxxxxxxxxxx
+
+# GO: local execution
+# XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_ALLOCATOR=platform CUDA_VISIBLE_DEVICES=0 python3 jserv_hf_fast.py
+
+# When done try
+# http://localhost:9995/docs#/default/read_completions_engines_completions_post   (port = SERVER_PORT below)
+
+# now you are in FastAPI + EleutherAI land
+# note: needs async on the read_completions otherwise jax gets upset
+# REMEMBER: adjust the location of the checkpoint image TAR_PATH
+
+#
+
+# Using plain HF instead of Jax so can comment out JAX related for this install
+# -----------------------------------------
+# # uses https://github.com/kingoflolz/mesh-transformer-jax
+
+# # so install jax on your system so recommend you get it working with your GPU first
+# # !apt install zstd
+#
+# #
+# # the "slim" version contain only bf16 weights and no optimizer parameters, which minimizes bandwidth and memory
+# # wget https://the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd
+
+# # tar -I zstd -xf step_383500_slim.tar.zstd
+
+# # git clone https://github.com/kingoflolz/mesh-transformer-jax.git
+# # pip install -r mesh-transformer-jax/requirements.txt
+
+# # jax 0.2.12 is required due to a regression with xmap in 0.2.13
+# # pip install mesh-transformer-jax/ jax==0.2.12
+# # I have cuda 10.1 and python 3.9 so had to update
+# # pip3 install --upgrade "https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.66+cuda101-cp39-none-manylinux2010_x86_64.whl"
+
+# -----------------------------------------
+#
+# Started 2021-06-19 (USA Juneteenth) and released to freedom under MIT 
+#
+
+
+from termcolor import colored
+
+#from flask import Flask, redirect, url_for, request
+import json
+import torch
+import requests
+import subprocess
+import tarfile
+import os
+import re
+import time
+from threading import Timer
+
+
+
+from typing import Optional
+from typing import Dict
+from fastapi import FastAPI,Request,Body
+import uvicorn
+import nest_asyncio
+from pyngrok import ngrok
+
+
+
+import threading 
+import numpy as np
+import transformers
+
+from transformers import GPTNeoForCausalLM, AutoConfig,AutoTokenizer,GPT2Tokenizer
+
+
+
+# Announce start-up before any of the slow steps (unpack, model load) begin.
+print(colored("Server Initialization ...", "magenta"))
+
+# Tunnel selector carried over from the notebook version; nothing below reads it
+# while the Cloudflare / flask-ngrok wiring stays commented out.
+connect_method = "Ngrok"  #@param ["Ngrok", "Cloudflare"]
+
+#if connect_method == "Cloudflare":
+#   from flask_cloudflared import run_with_cloudflared
+#elif connect_method == "Ngrok":
+#   from flask_ngrok import run_with_ngrok
+
+# Placeholders; both are bound for real once the checkpoint has been loaded.
+model = tokenizer = None
+
+
+#------------------------------------------
+# REMEMBER: Change these settings to local values
+
+# Device the model is moved to and request tensors are sent to.
+runtime_gpu = "cuda:0"
+training_gpu = "cuda:0"
+active_model = ''
+
+# Directory holding j6b_ckpt.tar, and the directory of the expanded checkpoint.
+TAR_PATH = "../"
+check_point_dir = "../j6b_ckpt"
+
+# Port served by uvicorn and exposed through the ngrok tunnel.
+SERVER_PORT = 9995
+# While this still contains the "xxxxxx" placeholder, ngrok auth is skipped.
+NGROK_AUTH_TOKEN = "xxxxxxxxx"
+
+#-----------------------------------------
+#https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu
+# Print a short CUDA report: green when a GPU is usable, red otherwise.
+# The per-device queries can raise when no CUDA device is usable, so they are
+# only issued when CUDA is available; otherwise a single red line is printed
+# instead of crashing before the problem has been reported.
+cuda_ok = torch.cuda.is_available()
+report_color = "green" if cuda_ok else "red"
+
+print(colored("   torch.cuda.is_available() = "+str(cuda_ok), report_color))
+if cuda_ok:
+    print(colored("   torch.cuda.current_device() = "+str(torch.cuda.current_device()), report_color))
+    print(colored("   torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color))
+    print(colored("   torch.cuda.get_device_name(0) = "+str(torch.cuda.get_device_name()), report_color))
+    print(colored("   Mem Allocated:{}GB".format(round(torch.cuda.memory_allocated(0)/1024**3,1)), report_color))
+    print(colored("   Mem Cached: {}GB".format(round(torch.cuda.memory_reserved(0)/1024**3,1)), report_color))
+else:
+    print(colored("   No usable CUDA device found; skipping device details.", report_color))
+
+# Set path to tar file and unpack it
+# os.path.join tolerates a TAR_PATH written with or without a trailing slash.
+model_on_drive = os.path.join(TAR_PATH, "j6b_ckpt.tar")
+print(colored("Checking j6b_ckpt ...", "magenta"))
+print(colored("   TAR_PATH ={}".format(TAR_PATH),"green"))
+print(colored("   check_point_dir ={}".format(check_point_dir),"green"))
+print(colored("   model_on_drive ={}".format(model_on_drive),"green"))
+
+if (not os.path.isdir(check_point_dir)):
+    print(colored("Unpacking tar file, please wait...", "magenta"))
+    # Unpack into the parent of check_point_dir so the expanded directory lands
+    # where the loader looks for it (a bare extractall() always used the current
+    # working directory).  Assumes the archive's top-level directory is named
+    # like basename(check_point_dir) -- TODO confirm against the tar image.
+    extract_root = os.path.dirname(os.path.normpath(check_point_dir)) or "."
+    # NOTE(review): extractall() trusts the member paths stored in the archive;
+    # only run this on a checkpoint tar obtained from a trusted source.
+    with tarfile.open(model_on_drive, "r") as tar:
+        tar.extractall(path=extract_root)
+
+else:
+    print( colored("Expanded Checkpoint directory found", "green") ) 
+
+# Initialize the model
+print(colored("Initializing model, please wait...", "magenta"))
+
+# Start from the published GPT-Neo 2.7B configuration and override the fields
+# below, in this order, before the checkpoint is loaded.
+config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")
+for _field, _setting in (
+    ("attention_layers", ["global"] * 28),
+    ("attention_types", [["global"], 28]),
+    ("num_layers", 28),
+    ("num_heads", 16),
+):
+    setattr(config, _field, _setting)
+
+# Derived from the head count assigned just above (256 per head).
+config.hidden_size = config.num_heads * 256
+config.vocab_size = 50400
+
+# Extra switches; presumably read by the patched transformers build named in the
+# install notes at the top of the file -- TODO confirm.
+for _field, _setting in (("rotary", True), ("rotary_dim", 64), ("jax", True)):
+    setattr(config, _field, _setting)
+
+try:
+    from collections.abc import MutableMapping
+except ImportError:
+    from collections import MutableMapping
+from pathlib import Path
+
+class Checkpoint(MutableMapping):
+    """Read-only, lazily loaded mapping over an expanded checkpoint directory.
+
+    ``m.pt`` inside ``chkpt_dir`` is loaded once and used as an index: each
+    value is a path whose file name identifies a per-key file in the same
+    directory.  That file is only ``torch.load``-ed when the key is accessed.
+
+    Args:
+        chkpt_dir: Directory containing ``m.pt`` and the per-key files.
+        device: ``map_location`` used when loading the per-key files.
+    """
+
+    def __init__(self, chkpt_dir, device="cpu"):
+        self.device = device
+        self.chkpt_dir = Path(chkpt_dir)
+        # Join against the normalised Path rather than the raw argument.
+        self.checkpoint = torch.load(str(self.chkpt_dir / "m.pt"))
+
+    def __len__(self):
+        return len(self.checkpoint)
+
+    def __getitem__(self, key):
+        # Only the file name from the index is used; the directory always
+        # comes from chkpt_dir, so the checkpoint can be relocated.
+        path = self.chkpt_dir / Path(self.checkpoint[key]).name
+        return torch.load(str(path), map_location=self.device)
+
+    def __setitem__(self, key, value):
+        # Writes are accepted and ignored: the mapping is read-only.
+        return
+
+    def __delitem__(self, key, value=None):
+        # Python calls __delitem__(key) only.  The former required ``value``
+        # argument made ``del ckpt[key]`` and ``ckpt.pop(key)`` raise
+        # TypeError; it is kept as an optional argument for compatibility.
+        # Deletes are accepted and ignored, like writes.
+        return
+
+    def keys(self):
+        return self.checkpoint.keys()
+
+    def __iter__(self):
+        # NOTE(review): yields (key, loaded value) pairs rather than bare keys,
+        # which differs from the usual Mapping protocol.  Left as-is because
+        # the model loader may rely on it -- TODO confirm.
+        for key in self.checkpoint:
+            yield (key, self.__getitem__(key))
+
+    def __copy__(self):
+        return Checkpoint(self.chkpt_dir, device=self.device)
+
+    def copy(self):
+        # A copy is a fresh view over the same directory (re-reads m.pt).
+        return Checkpoint(self.chkpt_dir, device=self.device)
+
+def infer(context, top_k=40, top_p=0.9, temp=1.0, gen_len=512,repetition_penalty=1):
+    """Sample a continuation of ``context`` from the global model.
+
+    Args:
+        context: Prompt text.
+        top_k: Top-k sampling cut-off forwarded to ``model.generate``.
+        top_p: Nucleus sampling threshold forwarded to ``model.generate``.
+        temp: Sampling temperature.
+        gen_len: Passed as both ``min_length`` and ``max_length``, so it is the
+            length of the whole returned sequence (prompt included), not the
+            number of new tokens.
+        repetition_penalty: Repetition penalty forwarded to ``model.generate``.
+
+    Returns:
+        A list with one decoded string per generated sequence; each string
+        holds the full decoded sequence with special tokens stripped.
+    """
+    # Send the prompt to the same device the model was moved to.
+    ids = tokenizer(context, return_tensors="pt").input_ids.to(runtime_gpu)
+
+    # Only generation + decoding is timed, not tokenisation.
+    start = time.time()
+    output = model.generate(
+        ids,
+        do_sample=True,
+        min_length=gen_len,
+        max_length=gen_len,
+        temperature=temp,
+        use_cache=True,
+        # top_k and repetition_penalty used to be accepted but ignored
+        # (the penalty was hard-coded to 1.5); callers' values now apply.
+        top_k=top_k,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=6,
+        max_time=60,
+    )
+
+    samples = [tokenizer.decode(out_seq, skip_special_tokens=True) for out_seq in output]
+
+    print(colored(f"completion done in {time.time() - start:06}s","green"))
+    return samples
+
+def recursive_infer(initial_context, current_context=None, top_k=40, top_p=0.9, temp=1.0, gen_len=256, depth=0, max_depth=5,recursive_refresh=0,repetition_penalty=1):
+    """Chain generations, feeding each answer back in as the next context.
+
+    At each level the context is ``initial_context`` (first call) or the
+    previous answer, optionally prefixed with ``initial_context`` and a
+    ``" ... "`` separator when ``recursive_refresh == 1``.  The text produced
+    beyond the context is kept and the function recurses with it.
+
+    Args:
+        initial_context: Original prompt.
+        current_context: Answer produced by the previous level, if any.
+        top_k, top_p, temp, gen_len, repetition_penalty: Forwarded to ``infer``.
+        depth: Current recursion level.
+        max_depth: Level at which recursion stops; that level contributes ``''``.
+        recursive_refresh: ``1`` re-prepends ``initial_context`` at every level.
+
+    Returns:
+        The answers of levels ``depth .. max_depth - 1`` joined by single
+        spaces, followed by a trailing space (the last level adds ``''``).
+    """
+    lcc = len(current_context) if current_context else 0
+    cc = current_context if current_context else ''
+    print (colored("ENTER recursive_infer:{} {} {} {}".format(len(initial_context),lcc,depth,max_depth),"red"))
+    print (colored("    in_cc:{}".format(cc),"cyan"))
+
+    # The final level always returned '' and threw its generation away, so
+    # stop before running that generation.  Return values are unchanged.
+    if depth >= max_depth:
+        return ''
+
+    if not current_context:
+        c = initial_context
+    else:
+        prefix = initial_context + "\r\n ... \r\n" if recursive_refresh == 1 else ''
+        c = prefix + current_context
+
+    print (colored("loc_c:{}".format(c),"yellow"))
+    # NOTE(review): gen_len is passed on unchanged although the context grows
+    # between levels; the former unused `loc_len` estimate was removed.
+    i = infer( c, top_k, top_p, temp, gen_len,repetition_penalty)[0]
+    # Keep only what follows the context in the decoded output.
+    loc_ans = i[len(c):]
+    print (colored("    loc_i:{}".format(i),"white"))
+    print (colored("    loc_ans:{}".format(loc_ans),"white"))
+
+    recursive_ans = recursive_infer(initial_context, str(loc_ans),top_k, top_p, temp, gen_len, depth+1, max_depth,recursive_refresh,repetition_penalty)
+    returned_ans =  str(loc_ans +' '+ recursive_ans)
+    print (colored("    returned_ans:{}".format(returned_ans),"cyan"))
+    print (colored("EXIT recursive_infer:{} {} {} {}".format(len(initial_context),lcc,depth,max_depth),"red"))
+    return returned_ans
+
+#model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint())
+print(colored("loading GPTNeoForCausalLM.from_pretrained","magenta"))
+print(colored("   loading from {}".format(check_point_dir),"green"))
+# Weights come from the lazily loaded Checkpoint mapping, not a model name.
+model = GPTNeoForCausalLM.from_pretrained(
+    pretrained_model_name_or_path=None,
+    config=config,
+    state_dict=Checkpoint(check_point_dir),
+)
+print(colored("loading GPT2Tokenizer.from_pretrained","magenta"))
+#tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
+
+
+# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
+tokenizer     = AutoTokenizer.from_pretrained("gpt2")
+vocab         = tokenizer.get_vocab()
+vocab_keys    = vocab.keys()
+
+
+def find_keys(char):
+    # Every vocabulary entry that contains `char` anywhere in its text.
+    return [key for key in vocab_keys if char in key]
+
+
+# Banned vocabulary entries: anything with a bracket, plus the end-of-text marker.
+bad_words     = [
+    *find_keys("["),
+    *find_keys(" ["),
+    *find_keys("<|endoftext|>"),
+]
+# generate() expects a list of token-id lists; each banned entry is one token.
+bad_words_ids = [[vocab[key]] for key in bad_words]
+
+print(colored("    move to GPU","magenta"))
+model.to(runtime_gpu)
+
+print(colored(" >>>> DONE! <<<<", "green"))
+
+print(colored("PRETEST: warming up processing pipeline","magenta"))
+
+# Run one generation at start-up so the first real request is not the first
+# pass through the pipeline.
+pre_prompt = "I am the EleutherAI / GPT-J-6B based AI language model server. I will"
+print (colored("PROMPT:"+pre_prompt,"yellow"))
+print(colored(infer(pre_prompt)[0],"cyan"))
+
+# app = Flask(__name__)
+app = FastAPI()
+
+#if connect_method == "Cloudflare":
+#   run_with_cloudflared(app)
+#elif connect_method == "Ngrok":
+#   run_with_ngrok(app)
+
+# `app.route` on a FastAPI app is the raw Starlette decorator: the handler is
+# called with the request and must return a Response object, so a zero-argument
+# function returning a str fails on every hit.  `app.get` with an HTML response
+# class keeps the same URL and body while letting FastAPI build the response.
+from fastapi.responses import HTMLResponse
+
+
+@app.get("/", response_class=HTMLResponse)
+def home():
+    """Landing page confirming that the service is running."""
+    return "<h1>EleutherAI J6B Service Running!</h1>"
+
+
+@app.post('/request')
+async def koboldrequest(request: Request=None):
+    """Generate text for a KoboldAI-style client.
+
+    Expects a JSON body with ``text``, ``min``, ``max``, ``rep_pen``,
+    ``temperature`` and ``top_p``; ``numseqs`` (default 1) and ``retfultxt``
+    (default True) are optional for older clients.
+
+    Returns:
+        ``{"data": {"text": ...}}`` when ``retfultxt`` is true (old format,
+        first sequence only, prompt included), otherwise
+        ``{"data": {"seqs": [...]}}`` with the prompt stripped from every
+        sequence.  Any failure yields HTTP 400 with
+        ``{"error": {"extensions": {"code": ...}}}``.
+    """
+    # Imported here so the handler does not depend on other edits to the file.
+    from fastapi.responses import JSONResponse
+
+    def _error(detail):
+        # Error payload shape expected by the client.
+        return JSONResponse(
+            status_code=400,
+            content={"error": {"extensions": {"code": detail}}},
+        )
+
+    try:
+        # Starlette's Request.json is a coroutine method; it must be awaited.
+        js        = await request.json()
+        txt       = js["text"]
+        min_len   = js["min"]
+        max_len   = js["max"]
+        rep_pen   = js["rep_pen"]
+        temp      = js["temperature"]
+        top_p     = js["top_p"]
+
+        # Compatibility with un-updated clients
+        numseqs   = js.get("numseqs", 1)
+        retfultxt = js.get("retfultxt", True)
+
+        print(colored("Received Data: {0}".format(txt), "yellow"))
+
+        torch.cuda.empty_cache()
+        print(colored("Generating text, please wait...", "green"))
+
+        ids = tokenizer(txt, return_tensors="pt").input_ids.to(runtime_gpu)
+        prompt_len = ids.shape[1]
+
+        gen_tokens = model.generate(
+            ids.long(),
+            do_sample=True,
+            min_length=min_len,
+            max_length=max_len,
+            temperature=temp,
+            top_p=top_p,
+            repetition_penalty=rep_pen,
+            use_cache=True,
+            bad_words_ids=bad_words_ids,
+            num_return_sequences=numseqs,
+        ).long()
+
+        genout = []
+        for tkns in gen_tokens:
+            tkns = tkns.tolist()
+            if not retfultxt:
+                # Strip context tokens out of returned sequences.  Slicing from
+                # the prompt length also behaves when nothing new was generated
+                # (a negative-offset slice of 0 would return the whole sequence).
+                tkns = tkns[prompt_len:]
+            # Drop token 50256 before decoding.
+            tkns = [t for t in tkns if t != 50256]
+            genout.append(tokenizer.decode(tkns))
+        torch.cuda.empty_cache()
+
+        if not genout or genout[0] == "":
+            print(colored("[ERROR] Something went wrong during generation!", "red"))
+            return _error("Something went wrong during generation!")
+
+        if retfultxt:
+            # Outdated client, send old JSON format
+            print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
+            return JSONResponse(content={"data": {"text": genout[0]}})
+
+        # New client format with numseq support
+        for i, seq in enumerate(genout):
+            print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
+        return JSONResponse(content={"data": {"seqs": genout}})
+
+    except Exception as e:
+        # Request boundary: report the failure to the client instead of
+        # returning nothing.
+        print(colored("[ERROR] Something went wrong during generation!", "red"))
+        print(colored("{0}".format(e), "red"))
+        return _error("Something went wrong during generation! {0}".format(e))
+
+@app.post("/engines/completions")
+async def read_completions(
+#engine_id:str,
+        prompt:Optional[str] = None,
+        max_tokens: Optional[int]=16,
+        temperature: Optional[float]=1.0,
+        top_p:Optional[float]=1.0,
+        top_k:Optional[int]=40,
+        n:Optional[int]=1,
+        stream:Optional[bool]=False,
+        logprobs:Optional[int]=None,
+        echo:Optional[bool]=False,
+        stop:Optional[list]=None,
+        presence_penalty:Optional[float]=0.0001,
+        repetition_penalty:Optional[float]=1.0000,
+        best_of:Optional[int]=1,
+        recursive_depth:Optional[int]=0,
+        recursive_refresh:Optional[int]=0,
+        logit_bias:Optional[Dict[str,float]]=None,
+        request: Request=None
+    ):
+    """Completion endpoint with an OpenAI-like parameter set.
+
+    With ``recursive_depth == 0`` a single ``model.generate`` call produces
+    ``n`` sequences of ``max_tokens`` new tokens each; every choice text is the
+    full decoded sequence (prompt included).  Otherwise ``recursive_infer`` is
+    run ``n`` times in series and each choice text is the chained answer
+    without the prompt.
+
+    ``stream``, ``logprobs``, ``echo``, ``stop``, ``presence_penalty``,
+    ``best_of`` and ``logit_bias`` are accepted but not used.
+
+    Returns:
+        A dict with ``params`` (the query parameters as received), ``id``,
+        ``object``, ``created``, ``model`` and ``choices``.
+    """
+    response = {'params': dict(request.query_params)}
+    print(response)
+
+    # "|" in the prompt stands for a line break.
+    # NOTE(review): a missing prompt becomes the literal text "None" -- confirm
+    # whether that should be rejected instead.
+    text = str(prompt).replace("|", "\r\n")
+    prompt_len = len(text)
+    ids = tokenizer(text, return_tensors="pt").input_ids.to(runtime_gpu)
+    # generate() takes a total length, so add the prompt's token count.
+    max_length = max_tokens + ids.shape[1]
+    start = time.time()
+
+    if recursive_depth == 0:
+        gen_tokens = model.generate(
+            ids,
+            do_sample=True,
+            min_length=max_length,
+            max_length=max_length,
+            temperature=temperature,
+            use_cache=True,
+            num_beams=n,
+            num_return_sequences=n,
+            # top_k was accepted by the endpoint but never forwarded.
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            no_repeat_ngram_size=6,
+            max_time=60,
+        )
+    else:
+        # do it serial until we figure out parallel for recursive
+        gen_tokens = [
+            recursive_infer(
+                initial_context=text,
+                current_context=None,
+                top_p=top_p, top_k=top_k, temp=temperature,
+                gen_len=max_length,
+                depth=0,
+                max_depth=recursive_depth,
+                recursive_refresh=recursive_refresh,
+                repetition_penalty=repetition_penalty,
+            )
+            for _ in range(n)
+        ]
+
+    choices = []
+    total_chars = 0
+    generated_chars = 0
+    for i, out_seq in enumerate(gen_tokens):
+        if recursive_depth == 0:
+            choice_text = tokenizer.decode(out_seq, skip_special_tokens=True)
+            # Decoded text still contains the prompt; don't count it as output.
+            generated_chars += len(choice_text) - prompt_len
+        else:
+            # recursive_infer already returns text without the prompt.
+            choice_text = out_seq
+            generated_chars += len(choice_text)
+        total_chars += len(choice_text)
+
+        choices.append({
+            'prompt': text,
+            'text': choice_text,
+            'index': i,
+            'logprobs': None,
+            'finish_reason': 'length',
+        })
+        print("GenText[{}]:{}".format(i, choice_text))
+
+    elapsed = time.time() - start
+    # Characters generated per second, counted per choice (the old figure
+    # subtracted the prompt once regardless of n or of recursive mode).
+    cps = generated_chars / elapsed if elapsed > 0 else 0.0
+
+    print("elapsed:{} len:{} cps:{}".format(elapsed, total_chars, cps))
+
+    response['id']=''
+    response['object']='text_completion'
+    response['created']=''
+    response['model']= 'GPT-J-6B_HF' #args.model
+    response['choices']=choices
+
+    return response
+
+print(colored("Model startup complete! Starting web service....", "green"))
+
+# Setting an auth token allows us to open multiple
+# tunnels at the same time.  The token is skipped while it is unset or still
+# contains the "xxxxxx" placeholder.
+if not (NGROK_AUTH_TOKEN is None or "xxxxxx" in NGROK_AUTH_TOKEN):
+    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
+
+# Open the public tunnel onto the local port before the server starts.
+public_url = ngrok.connect(SERVER_PORT)
+print(colored("Public_URL = "+str(public_url), "cyan"))
+
+nest_asyncio.apply()
+#app.run()   
+#if __name__ == "__main__":
+print(colored("Ready to Serve!", "green"))
+
+# Blocks until the server is shut down; the final message prints afterwards.
+uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
+print (colored("Happy Service!", "green"))
+
+# http://localhost:9995/docs#/default/read_completions_engines_completions_post
+# http://<NGROK_URL_ID>.ngrok.io/docs#/default/read_completions_engines_completions_post
+# http://<NGROK_URL_ID>.ngrok.io/docs#/default/koboldrequest_request_post
+
No results found