Add pipeline tag, link to paper

#1
opened by nielsr (HF staff)
Files changed (1)
  1. README.md +81 -3
README.md CHANGED
@@ -1,3 +1,81 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ library_name: transformers
+ pipeline_tag: text-generation
+ ---
+ # LongVU
+
+ Play with the model on the [HF demo](https://huggingface.co/spaces/Vision-CAIR/LongVU).
+
+ <div align="left">
+ <a href='https://vision-cair.github.io/LongVU'><img src="https://longvu.s3.amazonaws.com/assets/demo.gif" alt="Demo GIF" style="width: 100%; max-width: 650px;"></a>
+ </div>
+
+ This repository contains the model described in [LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding](https://huggingface.co/papers/2410.17434).
+
+ For code, see https://github.com/Vision-CAIR/LongVU.
+
+ # Use
+
+ We provide a simple example of the generation process below. For more details, please refer to the [GitHub repository](https://github.com/Vision-CAIR/LongVU).
+
+ ```python
+ # git clone https://github.com/Vision-CAIR/LongVU
+ import numpy as np
+ import torch
+ from longvu.builder import load_pretrained_model
+ from longvu.constants import (
+     DEFAULT_IMAGE_TOKEN,
+     IMAGE_TOKEN_INDEX,
+ )
+ from longvu.conversation import conv_templates, SeparatorStyle
+ from longvu.mm_datautils import (
+     KeywordsStoppingCriteria,
+     process_images,
+     tokenizer_image_token,
+ )
+ from decord import cpu, VideoReader
+
+ # Load the checkpoint, tokenizer, and image processor
+ tokenizer, model, image_processor, context_len = load_pretrained_model(
+     "./checkpoints/longvu_qwen", None, "cambrian_qwen",
+ )
+
+ model.eval()
+ video_path = "./examples/video1.mp4"
+ qs = "Describe this video in detail"
+
+ # Sample roughly one frame per second from the video
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+ fps = float(vr.get_avg_fps())
+ frame_indices = np.array([i for i in range(0, len(vr), round(fps))])
+ video = []
+ for frame_index in frame_indices:
+     img = vr[frame_index].asnumpy()
+     video.append(img)
+ video = np.stack(video)
+ image_sizes = [video[0].shape[:2]]
+ video = process_images(video, image_processor, model.config)
+ video = [item.unsqueeze(0) for item in video]
+
+ # Build the conversation prompt with the image placeholder token
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+ conv = conv_templates["qwen"].copy()
+ conv.append_message(conv.roles[0], qs)
+ conv.append_message(conv.roles[1], None)
+ prompt = conv.get_prompt()
+
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)
+ # Stop generation at the conversation separator
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+ keywords = [stop_str]
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+ with torch.inference_mode():
+     output_ids = model.generate(
+         input_ids,
+         images=video,
+         image_sizes=image_sizes,
+         do_sample=False,  # greedy decoding; temperature has no effect here
+         temperature=0.2,
+         max_new_tokens=128,
+         use_cache=True,
+         stopping_criteria=[stopping_criteria],
+     )
+ pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+ ```
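
Note that the example samples roughly one frame per second, so long videos can produce a large frame stack before `process_images` is called. Below is a minimal sketch of capping the number of sampled frames; it builds on the variables from the example above, and `max_frames` is an illustrative assumption rather than a setting documented by LongVU.

```python
# Hypothetical cap on the number of sampled frames for very long videos.
# max_frames is an illustrative choice, not a value from the LongVU model card.
max_frames = 1000
if len(frame_indices) > max_frames:
    # Resample uniformly across the whole video instead of one frame per second.
    frame_indices = np.linspace(0, len(vr) - 1, max_frames, dtype=int)
```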