import os
os.environ["RWKV_V7_ON"] = '1'
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '0' # '0' = do not compile the custom CUDA kernel (keep everything on CPU here)

from rwkv.model import RWKV

import gc, re
import gradio as gr
import base64
from io import BytesIO
import torch
import torch.nn.functional as F
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *

nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)  # NVML handle, used only for VRAM logging
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ctx_limit = 4000
gen_limit = 1000

########################## text rwkv ################################################################
from rwkv.utils import PIPELINE, PIPELINE_ARGS

title_v6 = "rwkv7-g1-2.9b-20250519-ctx4096"
model_path_v6 = hf_hub_download(repo_id="BlinkDL/rwkv7-g1", filename=f"{title_v6}.pth")
model_v6 = RWKV(model=model_path_v6.replace('.pth',''), strategy='cpu fp32')
pipeline_v6 = PIPELINE(model_v6, "rwkv_vocab_v20230424")
args = model_v6.args

penalty_decay = 0.996

def generate_prompt(instruction, input=""):
    instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
    input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
    if input:
        return f"""Instruction: {instruction}\n\nInput: {input}\n\nResponse:"""
    else:
        return f"""User: {instruction}\n\nAssistant:"""

def qa_prompt(instruction):
    instruction = instruction.strip().replace('\r\n','\n')
    instruction = re.sub(r'\n+', '\n', instruction)
    return f"User: {instruction}\n\nAssistant:"

def evaluate(
    ctx,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty=0.1,
    countPenalty=0.1,
):
    args = PIPELINE_ARGS(temperature=max(0.2, float(temperature)), top_p=float(top_p),
                         alpha_frequency=countPenalty,
                         alpha_presence=presencePenalty,
                         token_ban=[],    # ban the generation of some tokens
                         token_stop=[0])  # stop generation whenever you see any token here

    ctx = ctx.strip()
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        # First step: feed the (truncated) prompt; afterwards: feed only the last sampled token.
        input_ids = pipeline_v6.encode(ctx)[-ctx_limit:] if i == 0 else [token]
        out, state = model_v6.forward(input_ids, state)
        # Apply presence/frequency penalties to tokens that have already appeared.
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline_v6.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        # Decay existing penalties so early tokens are gradually forgiven.
        for xxx in occurrence:
            occurrence[xxx] *= penalty_decay

        ttt = pipeline_v6.decode([token])
        www = 1
        if ttt in ' \t0123456789':
            www = 0
        #elif ttt in '\r\n,.;?!"\':+-*/=#@$%^&_~|<>\\()[]{},。;“”:?!()【】':
        #    www = 0.5
        if token not in occurrence:
            occurrence[token] = www
        else:
            occurrence[token] += www

        # Only emit text once it decodes cleanly (no UTF-8 replacement character).
        tmp = pipeline_v6.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
    del out
    del state
    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()

examples = [
    # [prompt, token_count, temperature, top_p, presencePenalty, countPenalty]
    # Additional examples can be added here; the sampling values below are illustrative.
    ["User: simulate SpaceX mars landing using python\n\nAssistant:", gen_limit, 1.0, 0.3, 0.5, 0.5],
]
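
# --- Optional: drive `evaluate` directly, without the Gradio UI below. ---
# A minimal sketch for local testing only. The RWKV_DEMO_SMOKE_TEST guard, the prompt,
# and the sampling values are assumptions made for this example, not part of the demo.
if __name__ == "__main__" and os.environ.get("RWKV_DEMO_SMOKE_TEST") == "1":
    test_prompt = qa_prompt("Explain the Fibonacci sequence in one sentence.")
    last = ""
    # `evaluate` is a generator that repeatedly yields the growing output string.
    for partial in evaluate(test_prompt, token_count=100, temperature=1.0, top_p=0.3,
                            presencePenalty=0.5, countPenalty=0.5):
        last = partial
    print(last)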

with gr.Blocks(title=title_v6) as demo:
    gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title_v6}</h1>\n</div>\n")
    with gr.Tab("=== Base Model (Raw Generation) ==="):
        gr.Markdown(f'This is [RWKV7 G1](https://huggingface.co/BlinkDL/rwkv7-g1) 2.9B reasoning base LM - an attention-free pure RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). Try top_p=0, penalty=0 for math/code/translation. Supports 100+ world languages and code. Check [400+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). *** Can try examples (bottom of page) *** (can edit them). Demo limited to ctxlen {ctx_limit}.')
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(lines=6, label="Prompt", value="User: simulate SpaceX mars landing using python\n\nAssistant: