transformers
Reference: https://huggingface.co/docs/transformers/main/en/autoclass_tutorial. An AutoClass automatically infers and loads the correct architecture for a given checkpoint.

For text, use a Tokenizer to convert the text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. For speech and audio, use a Feature extractor to extract sequential features from the audio waveform and convert them into tensors. For image inputs, use an ImageProcessor to convert images into tensors. For multimodal inputs, use a Processor, which combines a tokenizer with a feature extractor or an image processor.

1.AutoTokenizer

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input)
{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

The main tool for preprocessing text data is the tokenizer. A tokenizer splits text into tokens according to a set of rules, then converts the tokens into numbers and tensors that become the model inputs. If you use a pretrained model, it is important to use the matching pretrained tokenizer: it ensures the text is split the same way as the pretraining corpus and mapped to the same token indices (the vocabulary) used during pretraining.

attention_mask is a binary vector with the same length as the input sequence. It indicates which positions hold real tokens and which hold padding tokens. Padding tokens are added so that sentences of different lengths can be aligned and batched together; setting the attention_mask to 0 at padded positions lets the model ignore those positions when computing attention, which improves efficiency and ensures the model attends only to real tokens.

token_type_ids is an integer vector with the same length as the input sequence, used to distinguish different segments. When the input contains multiple sequences, such as the question and the answer in a question-answering task, token_type_ids marks which tokens belong to which sequence, helping the model understand the relationship between them.

In short, attention_mask flags padding tokens so the model can ignore them, while token_type_ids distinguishes segments so the model can treat them appropriately.
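To make the segment distinction visible, you can pass a sentence pair to the tokenizer. A minimal sketch, reusing the tokenizer above (the sentence pair is illustrative, not from the tutorial):

pair = tokenizer("What about elevensies?", "Elevensies comes after second breakfast.")
print(pair["token_type_ids"])
# 0 for the first sentence's tokens (including its [CLS] and [SEP]), 1 for the second's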

tokenizer.decode(encoded_input["input_ids"])
'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'

The tokenizer added two special tokens to the sentence: CLS (classifier) and SEP (separator). Not all models need special tokens, but when they do, the tokenizer adds them for you automatically.
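If you want to verify this yourself, a small sketch: the insertion can be disabled with add_special_tokens=False, and the token strings inspected with convert_ids_to_tokens.

ids = tokenizer("Do not meddle in the affairs of wizards.", add_special_tokens=False)["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # no [CLS] or [SEP] in the output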

1.1 pad

Sentences are not always the same length, but tensors (the model inputs) need a uniform shape. Padding is a strategy for making tensors rectangular: it adds a special padding token to the shorter sentences. Setting the padding parameter to True pads the shorter sequences in a batch to match the longest one.

batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True)
print(encoded_input)
{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
               [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
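Padding to the longest sequence in the batch is what padding=True does. As a sketch of an alternative strategy, padding="max_length" pads every sequence to a fixed length instead (max_length=16 here is an arbitrary illustrative value, chosen to be longer than the longest sentence above):

encoded_fixed = tokenizer(batch_sentences, padding="max_length", max_length=16)
print([len(ids) for ids in encoded_fixed["input_ids"]])  # [16, 16, 16]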

1.2 truncation

Sometimes a sequence is too long for the model to handle. In that case, it needs to be truncated to a shorter length. Setting the truncation parameter to True truncates sequences to the maximum length the model accepts:

batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
print(encoded_input)
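This batch is short enough that nothing is actually cut off. To see truncation take effect, a sketch with an explicit max_length (the value 8 is purely illustrative); any sequence longer than it is cut down:

encoded_short = tokenizer(batch_sentences, truncation=True, max_length=8)
print([len(ids) for ids in encoded_short["input_ids"]])  # no entry longer than 8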

1.3 build tensors

Finally, set the return_tensors parameter so the tokenizer returns the actual tensors that get fed to the model ("pt" for PyTorch, "tf" for TensorFlow):

batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)
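With return_tensors="pt" the values are PyTorch tensors rather than Python lists, so the batch now has an explicit rectangular shape; for this batch, 3 sentences padded to 15 tokens:

print(encoded_input["input_ids"].shape)  # torch.Size([3, 15])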

2.AutoImageProcessor

from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
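A hedged usage sketch (assumes the Pillow library and a local file example.jpg; the filename is illustrative): the image processor resizes and normalizes the image into the pixel_values tensor the model expects.

from PIL import Image

image = Image.open("example.jpg")
inputs = image_processor(image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224]) for this checkpoint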

3.AutoFeatureExtractor

from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(
    "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
)

4.AutoProcessor

LayoutLMv2 needs an image processor to handle images and a tokenizer to handle text; an AutoProcessor combines the two.

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
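A hedged usage sketch: by default the LayoutLMv2 processor runs OCR on the document image (which requires pytesseract to be installed) and tokenizes the recognized words; the filename is illustrative.

from PIL import Image

document = Image.open("invoice.png").convert("RGB")
encoding = processor(document, return_tensors="pt")
print(encoding.keys())  # input_ids, token_type_ids, attention_mask, bbox, image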

5.AutoModel

Finally, the AutoModelFor classes load a pretrained model for a given task:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
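To tie the pieces together, a minimal sketch pairing this model with its own checkpoint's tokenizer (the example sentence is illustrative; in practice, always load the tokenizer from the same checkpoint as the model):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
inputs = tokenizer("Do not meddle in the affairs of wizards.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, num_labels)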
