{"id":6353,"date":"2024-04-02T14:19:42","date_gmt":"2024-04-02T06:19:42","guid":{"rendered":"https:\/\/blog.deephour.com\/?p=6353"},"modified":"2024-04-02T14:19:42","modified_gmt":"2024-04-02T06:19:42","slug":"%e5%a4%9a%e6%a8%a1%e6%80%81%e5%a4%a7%e6%a8%a1%e5%9e%8b-clip-blip-blip2-llava-minigpt4-instructblip-%e7%b3%bb%e5%88%97%e8%a7%a3%e8%af%bb%e8%bd%ac%e8%bd%bd","status":"publish","type":"post","link":"https:\/\/blog.deephour.com\/index.php\/2024\/04\/02\/%e5%a4%9a%e6%a8%a1%e6%80%81%e5%a4%a7%e6%a8%a1%e5%9e%8b-clip-blip-blip2-llava-minigpt4-instructblip-%e7%b3%bb%e5%88%97%e8%a7%a3%e8%af%bb%e8%bd%ac%e8%bd%bd\/","title":{"rendered":"\u591a\u6a21\u6001\u5927\u6a21\u578b CLIP, BLIP, BLIP2, LLaVA, miniGPT4, InstructBLIP \u7cfb\u5217\u89e3\u8bfb(\u8f6c\u8f7d)"},"content":{"rendered":"\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"512\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/\u591a\u6a21\u6001-1024x512.png\" alt=\"\" class=\"wp-image-6354\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/\u591a\u6a21\u6001-1024x512.png 1024w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/\u591a\u6a21\u6001-300x150.png 300w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/\u591a\u6a21\u6001-768x384.png 768w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/\u591a\u6a21\u6001.png 1440w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" 
\/><\/figure>\n\n\n\n<p>\u4f5c\u8005\uff1a\u69b4\u83b2\u9165<br>\u94fe\u63a5\uff1ahttps:\/\/zhuanlan.zhihu.com\/p\/653902791<br>\u6765\u6e90\uff1a\u77e5\u4e4e<br>\u8457\u4f5c\u6743\u5f52\u4f5c\u8005\u6240\u6709\u3002\u5546\u4e1a\u8f6c\u8f7d\u8bf7\u8054\u7cfb\u4f5c\u8005\u83b7\u5f97\u6388\u6743\uff0c\u975e\u5546\u4e1a\u8f6c\u8f7d\u8bf7\u6ce8\u660e\u51fa\u5904\u3002<\/p>\n\n\n\n<p>\u89c6\u89c9+\u8bed\u8a00\u7684\u591a\u6a21\u6001\u5927\u6a21\u578b\u76ee\u524d\u4e3b\u6d41\u65b9\u6cd5\u662f\uff1a\u501f\u52a9\u9884\u8bad\u7ec3\u597d\u7684\u5927\u8bed\u8a00\u6a21\u578b\u548c\u56fe\u50cf\u7f16\u7801\u5668\uff0c\u7528\u4e00\u4e2a<strong>\u56fe\u6587\u7279\u5f81\u5bf9\u9f50\u6a21\u5757<\/strong>\u6765\u8fde\u63a5\uff0c\u4ece\u800c\u8ba9\u8bed\u8a00\u6a21\u578b\u7406\u89e3\u56fe\u50cf\u7279\u5f81\u5e76\u8fdb\u884c\u66f4\u6df1\u5c42\u7684\u95ee\u7b54\u63a8\u7406\u3002<\/p>\n\n\n\n<p>\u8fd9\u6837\u53ef\u4ee5\u5229\u7528\u5df2\u6709\u7684\u5927\u91cf\u5355\u6a21\u6001\u8bad\u7ec3\u6570\u636e\u8bad\u7ec3\u5f97\u5230\u7684\u5355\u6a21\u6001\u6a21\u578b\uff0c\u51cf\u5c11\u5bf9\u4e8e\u9ad8\u8d28\u91cf\u56fe\u6587\u5bf9\u6570\u636e\u7684\u4f9d\u8d56\uff0c\u5e76\u901a\u8fc7<strong>\u7279\u5f81\u5bf9\u9f50\u3001\u6307\u4ee4\u5fae\u8c03<\/strong>\u7b49\u65b9\u5f0f\u6253\u901a\u4e24\u4e2a\u6a21\u6001\u7684\u8868\u5f81\u3002<\/p>\n\n\n\n<p>\u4e3b\u8981\u7684\u51e0\u4e2a\u65b9\u6cd5\u68b3\u7406\u5982\u4e0b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"608\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-20.jpeg\" alt=\"\" class=\"wp-image-6377\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-20.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-20-300x253.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" 
\/><\/figure>\n\n\n\n<p>\u63a5\u4e0b\u6765\u4f1a\u9010\u4e00\u8bb2\u89e3\u8fd9\u51e0\u4e2a\u65b9\u6cd5\u5bf9\u5e94\u7684\u8bba\u6587\u7ec6\u8282\uff0c\u4ee5\u53ca\u6700\u540e\u8865\u5145\u4e86MME\u8fd9\u7bc7\u5de5\u4f5c\u4e2d\u5bf9\u591a\u6a21\u6001\u5927\u6a21\u578b\u8fdb\u884c\u7efc\u5408\u8bc4\u4f30\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">CLIP (Contrastive Language-Image Pre-training)<\/h2>\n\n\n\n<p><strong>\u57fa\u672c\u601d\u60f3<\/strong>\uff1aCLIP\u7684\u57fa\u672c\u7b97\u6cd5\u539f\u7406\u662f\u6587\u672c\u548c\u56fe\u50cf\u5728\u7279\u5f81\u57df\u8fdb\u884c\u5bf9\u9f50\u3002<\/p>\n\n\n\n<p><strong>\u6a21\u578b\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u4e3a\u4e86\u5bf9image\u548ctext\u5efa\u7acb\u8054\u7cfb\uff0c\u9996\u5148\u5206\u522b\u5bf9image\u548ctext\u8fdb\u884c\u7279\u5f81\u63d0\u53d6\uff0cimage\u7279\u5f81\u63d0\u53d6\u7684backbone\u53ef\u4ee5\u662fresnet\u7cfb\u5217\u6a21\u578b\u4e5f\u53ef\u4ee5\u662fVIT\u7cfb\u5217\u6a21\u578b\uff1btext\u7279\u5f81\u63d0\u53d6\u76ee\u524d\u4e00\u822c\u91c7\u7528bert\u6a21\u578b\u3002<\/li>\n\n\n\n<li>\u7279\u5f81\u63d0\u53d6\u4e4b\u540e\uff0c\u7531\u4e8e\u505a\u4e86normalize\uff0c\u76f4\u63a5\u76f8\u4e58\u6765\u8ba1\u7b97\u4f59\u5f26\u8ddd\u79bb\uff0c\u540c\u4e00pair\u5bf9\u7684\u7ed3\u679c\u8d8b\u8fd1\u4e8e1\uff0c\u4e0d\u540cpair\u5bf9\u7684\u7ed3\u679c\u8d8b\u8fd1\u4e8e0\uff0c\u56e0\u6b64\u5c31\u53ef\u4ee5\u91c7\u7528\u5bf9\u6bd4\u635f\u5931loss\uff08info-nce-loss\uff09\u3010\u8fd9\u91cc\u8981\u6bd4\u8f83\u5927\u7684batch size\u624d\u80fd\u6709\u6548\u679c\uff0c\u7c7b\u4f3c\u4e8e\u7ef4\u62a4\u4e00\u4e2a\u5927\u7684\u7279\u5f81\u76f8\u4f3c\u5ea6\u77e9\u9635\u3011<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"534\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-10.jpeg\" alt=\"\" class=\"wp-image-6366\" 
srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-10.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-10-300x223.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><figcaption class=\"wp-element-caption\">\u5bf9\u6bd4\u5b66\u4e60\uff0c\u7ef4\u62a4\u4e00\u4e2a\u56fe\u6587\u7279\u5f81\u76f8\u4f3c\u5ea6\u77e9\u9635<\/figcaption><\/figure>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5229\u7528clip\u8fdb\u884c\u56fe\u50cf\u5206\u7c7b\u6709\u4e24\u79cd\u65b9\u5f0f\uff0c\u4e00\u79cd\u662f\u76f4\u63a5\u5229\u7528zero-shot \u65b9\u5f0f\u8fdb\u884c\u9884\u6d4b\uff0c\u5982\u4e0b\u56fe\u6240\u793a\uff0c\u5c06text\u5047\u8bbe\u4e3a a photo of [object], \u5206\u522b\u5bf9image \u548c text\u8fdb\u884c\u7279\u5f81\u63d0\u53d6\u4ee5\u53ca\u4f59\u5f26\u8ddd\u79bb\uff0c\u5f53object\u4e3a\u76ee\u6807\u7c7b\u522b\u65f6\uff0c\u76f8\u4f3c\u5ea6\u6700\u9ad8\uff0c\u5373\u4e3a\u9884\u6d4b\u7ed3\u679c\uff08\u6548\u679c\u597d\uff09\uff1b\u8fd8\u6709\u4e00\u79cd\u65b9\u5f0f\u5c31\u662f\u518d\u91cd\u65b0finetune\uff0c\u540c\u6837\u4e5f\u662f\u5bf9\u7c7b\u522b\u8bbe\u8ba1\u51e0\u79cd\u4e0d\u540c\u7684\u6587\u672c\uff0c\u8fd9\u6837\u6548\u679c\u80fd\u591f\u8fbe\u5230sota\u7684\u6c34\u5e73\uff01<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"506\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-6.jpeg\" alt=\"\" class=\"wp-image-6362\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-6.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-6-300x211.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><figcaption 
class=\"wp-element-caption\">\u76ee\u6807\u7c7b\u522b\u5bf9\u5e94\u7684\u56fe\u6587\u7279\u5f81\u4f59\u5f26\u8ddd\u79bb\u6700\u5927<\/figcaption><\/figure>\n\n\n\n<p><br><br><strong>\u8bad\u7ec3loss<\/strong>\uff1a\u53c2\u8003\u5bf9\u6bd4\u5b66\u4e60\u635f\u5931\uff0c\u91c7\u7528\u4e86info-nce-loss\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u4ea4\u53c9\u71b5\u4ee3\u4ef7\u51fd\u6570\uff08cross entropy\uff09\uff1a\u6700\u57fa\u7840\u7684\u6709\u76d1\u7763\u5b66\u4e60\u591a\u5206\u7c7b\u635f\u5931\u51fd\u6570\uff0cgt\u662fn\u4e2a\u7c7b\u522b\u7684one-hot\u7f16\u7801\uff0c\u76ee\u6807\u662f\u6700\u5c0f\u5316gt\u7684one-hot\u6807\u7b7e\u548c\u9884\u6d4blogits\u7684\u8d1f\u5bf9\u6570\u4e58\u79ef\u5728\u591a\u4e2a\u7c7b\u522b\u4e0a\u7684\u52a0\u548c\uff0c\u4ece\u4fe1\u606f\u8bba\u7684\u89d2\u5ea6\u4e5f\u5c31\u662f\u6700\u5c0f\u5316\u6a21\u578b\u6570\u636e\u5206\u5e03\u4e0e\u8bad\u7ec3\u6570\u636e\u4e4b\u95f4\u7684KL\u6563\u5ea6\uff1b<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"511\" height=\"95\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-3.jpeg\" alt=\"\" class=\"wp-image-6359\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-3.jpeg 511w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-3-300x56.jpeg 300w\" sizes=\"auto, (max-width: 511px) 100vw, 511px\" \/><figcaption class=\"wp-element-caption\">n\u4e2a\u7c7b\u522b\u591a\u5206\u7c7b\u7684\u4ea4\u53c9\u71b5\u4ee3\u4ef7\u51fd\u6570<\/figcaption><\/figure>\n\n\n\n<ul class=\"wp-block-list\">\n<li>NCE\uff08noise contrastive estimation\uff09\uff1a\u548c\u4ea4\u53c9\u71b5\u7c7b\u4f3c\uff0c\u4f46\u662f\u628a\u591a\u5206\u7c7b\u95ee\u9898\u8f6c\u5316\u6210\u4e86\u4e8c\u5206\u7c7b\u95ee\u9898\uff0c\u4e00\u4e2a\u7c7b\u662f\u6570\u636e\u7c7b\u522b data sample\uff0c\u53e6\u4e00\u4e2a\u7c7b\u662f\u566a\u58f0\u7c7b\u522b noisy 
sample\uff0c\u76ee\u6807\u662f\u5b66\u4e60\u6570\u636e\u6837\u672c\u548c\u566a\u58f0\u6837\u672c\u4e4b\u95f4\u7684\u533a\u522b\uff0c\u4e5f\u5c31\u662f\u201c\u566a\u58f0\u5bf9\u6bd4\uff08noise contrastive\uff09 <a href=\"https:\/\/zhuanlan.zhihu.com\/p\/506544456\">https:\/\/zhuanlan.zhihu.com\/p\/506544456<\/a><\/li>\n<\/ul>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter\"><img loading=\"lazy\" decoding=\"async\" width=\"622\" height=\"158\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image.jpeg\" alt=\"\" class=\"wp-image-6355\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image.jpeg 622w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-300x76.jpeg 300w\" sizes=\"auto, (max-width: 622px) 100vw, 622px\" \/><figcaption class=\"wp-element-caption\">\u566a\u58f0\u5bf9\u6bd4NCE-loss<\/figcaption><\/figure>\n<\/div>\n\n\n<ul class=\"wp-block-list\">\n<li>Info-NCE\uff1a\u662fNCE\u7684\u4e00\u4e2a\u7b80\u5355\u53d8\u4f53\uff0c\u628a\u566a\u58f0\u6837\u672c\u4ece\u4e00\u4e2a\u7c7b\u522b\u53c8\u5212\u5206\u4e3a\u591a\u4e2a\u7c7b\u770b\u5f85\u3002\u516c\u5f0f\u4e2d\u7684temp\u662f\u4e00\u4e2a\u6e29\u5ea6\u8d85\u53c2\u6570\uff08\u6807\u91cf\uff09\uff0c\u5982\u679c\u5ffd\u7565temp\uff0c\u90a3\u4e48infoNCE loss\u5176\u5b9e\u5c31\u662fcross entropy loss\uff0c\u53ea\u662f\u5728cross entropy loss\u91cc\uff0ck\u6307\u4ee3\u7684\u662f\u6570\u636e\u96c6\u91cc\u7c7b\u522b\u7684\u6570\u91cf\uff0c\u800cInfoNCE loss\u91cc\uff0ck\u6307\u7684\u662f\u8d1f\u6837\u672c\u7684\u6570\u91cf\u3002\u516c\u5f0f\u5206\u6bcd\u4e2d\u7684sum\u662f\u57281\u4e2a\u6b63\u6837\u672c\u548ck\u4e2a\u8d1f\u6837\u672c\u4e0a\u505a\u7684\uff0c\u505a\u7684\u662f\u4e00\u4e2ak+1\u7c7b\u7684\u5206\u7c7b\u4efb\u52a1\uff0c\u76ee\u7684\u5c31\u662f\u60f3\u628aquery\u8fd9\u4e2a\u56fe\u7247\u5206\u5230k+\u8fd9\u4e2a\u7c7b\u3002<\/li>\n<\/ul>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter\"><img loading=\"lazy\" 
decoding=\"async\" width=\"720\" height=\"190\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1.jpeg\" alt=\"\" class=\"wp-image-6357\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1-300x79.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><figcaption class=\"wp-element-caption\">Info-NCE loss<\/figcaption><\/figure>\n<\/div>\n\n\n<ul class=\"wp-block-list\">\n<li>\u6e29\u5ea6\u7cfb\u6570temperature(\u672c\u6587\u56fa\u5b9a0.07)\uff1a\u4f5c\u7528\u662f\u63a7\u5236logits\u7684\u5206\u5e03\u5f62\u72b6\uff0c\u5bf9\u4e8e\u65e2\u5b9a\u7684logits\u5206\u5e03\u7684\u5f62\u72b6\uff0c\u5f53temp\u503c\u53d8\u5927\uff0c\u5219(q*k)\/temp\u53d8\u5c0f\uff0c\u6307\u6570\u8fd0\u7b97\u4e4b\u540e\u66f4\u5c0f\uff0c\u5bfc\u81f4\u539f\u6765\u7684logits\u5206\u5e03\u66f4\u5e73\u6ed1\u3002\u76f8\u53cd\uff0c\u5982\u679ctemp\u53d6\u5f97\u503c\u5c0f\uff0c\u539f\u6765\u7684logits\u5206\u5e03\u91cc\u7684\u6570\u503c\u5c31\u76f8\u5e94\u7684\u53d8\u5927\uff0c\u6307\u6570\u8fd0\u7b97\u4e4b\u540e\u66f4\u5927\uff0c\u5219\u8fd9\u4e2a\u5206\u5e03\u53d8\u5f97\u66f4\u96c6\u4e2d\uff0c\u66f4peak\u3002\u9700\u8981\u53d6\u4e00\u4e2a\u5408\u9002\u7684\u6570\u503c\uff0c\u65e2\u4e0d\u4f1a\u5bf9\u6240\u6709\u7684\u8d1f\u6837\u672c\u4e00\u89c6\u540c\u4ec1\uff0c\u4e5f\u4e0d\u4f1a\u8fc7\u5ea6\u5173\u6ce8\u96be\u6837\u672c\u3002<\/li>\n<\/ul>\n\n\n\n<p><strong>\u7ec6\u8282<\/strong>\uff1aCLIP\u7528\u4e86\u5927\u91cf\u7684\u8bad\u7ec3\u6570\u636e\u4ee5\u53ca\u8bad\u7ec3\u8d44\u6e90\uff0c\u5927\u529b\u51fa\u5947\u8ff9\u3002CLIP\u7528\u4e86400million\u7684image-text pair\u5bf9\u8fdb\u884c\u8bad\u7ec3\uff0c\u5bf9\u4e8eimage backbone\uff0cCLIP\u5c1d\u8bd5\u4e86\u4e24\u79cd\u7ed3\u6784\uff0cRN50x64 \u548c vit-L\uff0c\u5206\u522b\u7528\u4e86592 \u4e2a V100 + 18\u5929 \u7684\u65f6\u95f4 \u548c 256 \u4e2a V100 + 12\u5929 
\u7684\u65f6\u95f4\uff0c\u975e\u5927\u516c\u53f8\u76f4\u63a5\u529d\u9000\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">BLIP\uff08Bootstrapping language image pre-training\uff09<\/h2>\n\n\n\n<p><strong>\u57fa\u672c\u601d\u60f3<\/strong>\uff1a<\/p>\n\n\n\n<p>\u517c\u987e\u56fe\u6587\u7406\u89e3\u548c\u751f\u6210\u7684\u591a\u6a21\u6001\u6a21\u578b\uff08Multimodal mixture of Encoder-Decoder\uff09\uff0c\u540c\u65f6\u5728\u4e09\u4e2a\u89c6\u89c9\u8bed\u8a00\u76ee\u6807\u4e0a\u8054\u5408\u9884\u8bad\u7ec3\uff1a\u56fe\u50cf\u6587\u672c\u5bf9\u6bd4\u5b66\u4e60ITC\u3001\u56fe\u50cf\u6587\u672c\u5339\u914dITM\u3001\u56fe\u50cf\u6761\u4ef6\u8bed\u8a00\u5efa\u6a21LM\uff1b\u540c\u65f6\u63d0\u51fa\u4e86\u4e00\u79cd\u9ad8\u6548\u5229\u7528\u7f51\u7edc\u6536\u96c6\u7684\u5608\u6742\u56fe\u6587\u5bf9\u7684\u91c7\u6837+\u8fc7\u6ee4\u673a\u5236\u3002<br>bootstrapping\u7ffb\u8bd1\u6210\u201c\u81ea\u4e3e\u201d\u6709\u70b9\u522b\u626d\uff0c\u6211\u8fd8\u662f\u4e60\u60ef\u7406\u89e3\u4e3a\u6709\u653e\u56de\u62bd\u6837\/\u8fed\u4ee3\u4f18\u5316\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"416\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-12.jpeg\" alt=\"\" class=\"wp-image-6368\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-12.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-12-300x173.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u6a21\u578b\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u56fe\u50cf\u5757\u7684\u7f16\u7801\uff08ViT\uff09\uff1a\u56fe\u50cf\u6253\u6210patches\u5757\u540e\u8fdb\u884c\u7f16\u7801\uff0c\u589e\u52a0cls 
token\u6765\u8bb0\u5f55\u5168\u5c40\u7684\u7279\u5f81\uff08\u4f5c\u7528\u7c7b\u4f3c\u4f4d\u7f6e\u7f16\u7801\uff0c\u4fdd\u7559patches\u7684\u7a7a\u95f4\u7279\u5f81\uff09<\/li>\n\n\n\n<li>\u6587\u672c\u7684\u7f16\u7801\uff08BERT\uff09\uff1a\u5bf9\u53e5\u5b50\u8fdb\u884c\u7f16\u7801\uff0c\u589e\u52a0cls token\u8bb0\u5f55\u53e5\u5b50\u7684\u5168\u5c40\u7279\u5f81<\/li>\n\n\n\n<li>Image-grounded text encoder\uff1a\u5728\u6587\u672cembedding\u4e2d\u6ce8\u5165\u4e86\u56fe\u50cf\u7279\u5f81\uff0c\u901a\u8fc7\u5728self-attention\u548cFFN\u4e2d\u95f4\u589e\u52a0\u4e00\u5c42cross-attention\u6765\u5bf9\u9f50text-encoder\u548cimg-encoder\u7684\u7279\u5f81\u3002<\/li>\n\n\n\n<li>Image-grounded text decoder\uff1a\u7528causal self-attention\u5c42\uff08\u9884\u6d4b\u4e0b\u4e00\u4e2atoken\uff09\u4ee3\u66ff\u4e86\u53cc\u5411\u81ea\u6ce8\u610f\u529b\u5c42\uff08\u5efa\u7acb\u5f53\u524d\u8f93\u5165token\u7684\u8868\u8fbe\uff09\u3010\u548c\u5de6\u8fb9\u7684encoder\u5171\u4eab\u9664\u4e86self-attention\u4e4b\u5916\u7684\u5c42\u3011<\/li>\n<\/ul>\n\n\n\n<p><strong>\u8bad\u7ec3loss<\/strong>\uff1a<br>\u9884\u8bad\u7ec3\u9636\u6bb5\u540c\u65f6\u4f18\u53163\u4e2aloss\u9879\uff0c\u6bcf\u4e2a\u56fe\u6587\u5bf9\u53ea\u8fc71\u6b21vision-transformer(\u7b97\u529b\u6d88\u8017\u8f83\u5927)\uff0c\u8fc73\u6b21text-transformer<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Image-Text Contrastive Loss (ITC)\u7406\u89e3\u529f\u80fd\uff1a\u4f18\u5316vision-transformer+text-transformer\uff0c\u8ba9\u5339\u914d\u7684\u56fe\u6587\u5bf9\u6709\u8f83\u9ad8\u76f8\u4f3c\u5ea6\u7684\u8868\u8fbe\uff08\u7528\u4e86soft labels\uff09\uff0c\u591a\u6a21\u6001\u4e2d\u7684\u7ecf\u5178loss-&gt;\u4f7f\u5176<strong>\u4e92\u4fe1\u606f\u6700\u5927\u5316<\/strong>\uff1b<\/li>\n\n\n\n<li>Image-Text Matching Loss (ITM)\u7406\u89e3\u529f\u80fd \uff1a\u4f18\u5316Image-grounded text encoder\uff0c\u5b66\u4e60\u56fe\u6587\u7684<strong>\u7ec6\u7c92\u5ea6\u5339\u914d<\/strong>\u7684\u4e8c\u5206\u7c7b\uff0c\u91c7\u7528\u4e86hard negative mining 
strategy\uff1b<\/li>\n\n\n\n<li>Language Modeling Loss (LM)\u751f\u6210\u529f\u80fd\uff1a\u4f18\u5316image-grounded text decoder\uff0c\u5b66\u4e60\u5982\u4f55\u4ece\u7ed9\u5b9a\u56fe\u751f\u6210\u8fde\u8d2f\u7684\u6587\u672c\u63cf\u8ff0\uff0c\u91c7\u7528\u4ea4\u53c9\u71b5\u4ee3\u4ef7\u51fd\u6570\u4ee5\u81ea\u56de\u5f52\u65b9\u5f0f\u6700\u5927\u5316\u5bf9\u5e94\u6587\u672c\u6982\u7387\u3002<\/li>\n<\/ul>\n\n\n\n<p><br><br><strong>Captioning and Filtering<\/strong>\uff1a\u4e00\u79cd\u9ad8\u6548\u7684\u6570\u636e\u96c6\u589e\u5f3a\u65b9\u6cd5\uff0c\u4ece\u7f51\u9875\u566a\u58f0\u56fe\u50cf\u6587\u672c\u5bf9\u4e2d\u5b66\u4e60\uff0c\u91c7\u6837\u751f\u6210\u5668+\u566a\u58f0\u8fc7\u6ee4\u5668\uff0c\u90fd\u662f\u4ece\u76f8\u540c\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u521d\u59cb\u5316\uff0c\u5e76\u5728\u5c0f\u578b\u4eba\u5de5\u6ce8\u91ca\u6570\u636e\u96c6e.g.COCO\u4e0a\u5355\u72ec\u8fdb\u884c\u5fae\u8c03\u3002<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u91c7\u6837\u751f\u6210\u5668captioner\uff1a\u57fa\u4e8e\u56fe\u50cf\u7684\u6587\u672c\u89e3\u7801\u5668\uff08\u751f\u6210\uff09\uff0c\u7528LM\u7684loss\u8fdb\u884c\u5fae\u8c03<\/li>\n\n\n\n<li>\u566a\u58f0\u8fc7\u6ee4\u5668filter\uff1a\u57fa\u4e8e\u56fe\u50cf\u7684\u6587\u672c\u7f16\u7801\u5668\uff08\u5224\u522b\uff09\uff0c\u7528ITC+ITM\u8fdb\u884c\u5fae\u8c03\uff0c\u5224\u65ad\u56fe\u6587\u662f\u5426\u5339\u914d<\/li>\n<\/ol>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"296\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-21.jpeg\" alt=\"\" class=\"wp-image-6378\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-21.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-21-300x123.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" 
\/><\/figure>\n\n\n\n<p><br><strong>\u6548\u679c<\/strong>\uff1a\u4ece\u7528\u4eba\u5de5\u6807\u6ce8\u7684\u56fe\u6587\u5bf9\u5f97\u5230\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u5f00\u59cb\u521d\u59cb\u5316\uff0ccaptioner\u751f\u6210\u5408\u6210\u6587\u672c\uff0cfilter\u8fc7\u6ee4\u5408\u6210\u6587\u672c\u548c\u7f51\u9875\u63cf\u8ff0\u6587\u672c\uff0c\u5f97\u5230\u8fc7\u6ee4\u540e\u7684\u76f8\u5bf9\u5e72\u51c0\u7684\u7f51\u9875\u6587\u672c\u548c\u5408\u6210\u6587\u672c-&gt;\u7528\u6765\u8bad\u7ec3BLIP\uff0c\u5f80\u590d\u8fdb\u884c\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"349\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-24.jpeg\" alt=\"\" class=\"wp-image-6381\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-24.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-24-300x145.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u6570\u636e\u96c6<\/strong>\uff1a<br>\u56fe\u50cf\u7f16\u7801\u5668\u4f7f\u7528 ImageNet-1K \u4e0a\u9884\u8bad\u7ec3\u7684 ViT \u521d\u59cb\u5316\uff0c\u6587\u672c\u7f16\u7801\u5668\u4ee5 BERT-Base \u521d\u59cb\u5316\uff0c\u6309\u7167Vit-B 2880\/ Vit-L 2400\u7684batch-size\u8bad\u7ec3 20 Epochs\uff0c\u9884\u8bad\u7ec3\u9636\u6bb5\u56fe\u50cf\u5c3a\u5bf8224*224\uff0c\u5fae\u8c03\u9636\u6bb5\u5c3a\u5bf8384*384.<br>\u7528\u4e86\u4ee5\u4e0b3\u90e8\u5206\u6570\u636e\u96c6\u9884\u8bad\u7ec3\uff0c\u52a0\u8d77\u6765\u5927\u6982\u662f 14M\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>COCO<\/li>\n\n\n\n<li>Visual Genome<\/li>\n\n\n\n<li>\u7f51\u7edc\u6570\u636e\uff1aConceptual Captions\uff0cConceptual 12M\uff08\u566a\u58f0\u8f83\u5927\uff09\uff0cSBU Captions<\/li>\n<\/ul>\n\n\n\n<p>\u8fd8\u5c1d\u8bd5\u4e86\u4e00\u4e2a\u989d\u5916\u7684\u566a\u58f0\u6587\u672c\u8f83\u591a\u7684web \u6570\u636e\u96c6 LAION\uff08115M \u56fe\u50cf\uff09\u3002<\/p>\n\n\n\n<h2 
class=\"wp-block-heading\">BLIP2<\/h2>\n\n\n\n<p><br><strong>\u57fa\u672c\u601d\u60f3<\/strong>\uff1a<\/p>\n\n\n\n<p>\u5982\u6807\u9898\u6240\u8a00 Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models\uff0c\u5206\u4e24\u4e2a\u9636\u6bb5\uff0c\u901a\u8fc7\u5229\u7528\u9884\u8bad\u7ec3\u597d\u7684\u89c6\u89c9\u6a21\u578b\u548c\u8bed\u8a00\u6a21\u578b\u6765\u63d0\u5347\u591a\u6a21\u6001\u6548\u679c\u548c\u964d\u4f4e\u8bad\u7ec3\u6210\u672c\u3002<\/p>\n\n\n\n<p><strong>\u6a21\u578b\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<p>BLIP-2 \u7531\u9884\u8bad\u7ec3\u7684Image Encoder\uff0c\u9884\u8bad\u7ec3\u7684Large Language Model\uff0c\u548c\u4e00\u4e2a\u53ef\u5b66\u4e60\u7684 Q-Former \u7ec4\u6210\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Image Encoder\uff1a\u4ece\u8f93\u5165\u56fe\u7247\u4e2d\u63d0\u53d6\u89c6\u89c9\u7279\u5f81\uff0c\u5c1d\u8bd5\u4e86\u4e24\u79cd\u7f51\u7edc\u7ed3\u6784\uff0cCLIP \u8bad\u7ec3\u7684 ViT-L\/14\u548cEVA-CLIP\u8bad\u7ec3\u7684 ViT-g\/14\uff08\u53bb\u6389\u4e86\u6700\u540e\u4e00\u5c42\uff09\u3002<\/li>\n\n\n\n<li>Large Language Model\uff1a\u5927\u8bed\u8a00\u6a21\u578b\u8fdb\u884c\u6587\u672c\u751f\u6210\uff0c\u5c1d\u8bd5\u4e86\u63a5\u5165decoder-based LLM \u548c encoder-decoder-based LLM\u4e24\u79cd\u7ed3\u6784\u3002<\/li>\n\n\n\n<li>Q-Former\uff1a\u5f25\u8865\u89c6\u89c9\u548c\u8bed\u8a00\u4e24\u79cd\u6a21\u6001\u7684modality gap\uff0c\u53ef\u4ee5\u7406\u89e3\u4e3a\u56fa\u5b9a\u56fe\u50cf\u7f16\u7801\u5668\u548c\u56fa\u5b9aLLM\u4e4b\u95f4\u7684<strong>\u4fe1\u606f\u67a2\u7ebd<\/strong>\uff0c\u9009\u53d6\u6700\u6709\u7528\u7684\u89c6\u89c9\u7279\u5f81\u7ed9LLM\u6765\u751f\u6210\u6587\u672c\u3002<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"562\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-13.jpeg\" alt=\"\" class=\"wp-image-6370\" 
srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-13.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-13-300x234.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p>Q-Former\u7531Image Transformer\u548cText Transformer\u4e24\u4e2a\u5b50\u6a21\u5757\u6784\u6210\uff0c\u5b83\u4eec\u5171\u4eab\u76f8\u540c\u81ea\u6ce8\u610f\u529b\u5c42\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Image Transformer\uff1a\u901a\u8fc7\u548cimage encoder\u4ea4\u4e92\u6765\u63d0\u53d6\u89c6\u89c9\u7279\u5f81\uff0c\u8f93\u5165\u662f\u4e00\u7cfb\u5217\uff08\u6587\u4e2d\u7528\u768432\u4e2a*768\u957f\u5ea6\uff09\u53ef\u5b66\u4e60\u7684 Queries\uff0c\u8fd9\u4e9bQuery\u901a\u8fc7<strong>\u81ea\u6ce8\u610f\u529b\u5c42<\/strong>\u76f8\u4e92\u4ea4\u4e92\uff0c\u5e76\u901a\u8fc7<strong>\u4ea4\u53c9\u6ce8\u610f\u529b\u5c42<\/strong>\u4e0e\u51bb\u7ed3\u7684\u56fe\u50cf\u7279\u5f81\u4ea4\u4e92\uff0c\u8fd8\u53ef\u4ee5\u901a\u8fc7<strong>\u5171\u4eab\u7684\u81ea\u6ce8\u610f\u529b\u5c42<\/strong>\u4e0e\u6587\u672c\u8fdb\u884c\u4ea4\u4e92\uff1b\u8f93\u51fa\u7684query\u5c3a\u5bf8\u662f32*768\uff0c\u8fdc\u5c0f\u4e8e\u51bb\u7ed3\u7684\u56fe\u50cf\u7279\u5f81257*1024(ViT-L\/14)\u3002<\/li>\n\n\n\n<li>Text Transformer\uff1a\u65e2\u4f5c\u4e3a\u6587\u672c\u7f16\u7801\u5668\u4e5f\u4f5c\u4e3a\u6587\u672c\u89e3\u7801\u5668\uff0c\u5b83\u7684\u81ea\u6ce8\u610f\u529b\u5c42\u4e0eImage Transformer\u5171\u4eab\uff0c\u6839\u636e\u9884\u8bad\u7ec3\u4efb\u52a1\uff0c\u7528\u4e0d\u540c\u7684self-attention masks\u6765\u63a7\u5236Query\u548c\u6587\u672c\u7684\u4ea4\u4e92\u65b9\u5f0f\u3002<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"219\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-4.jpeg\" alt=\"\" class=\"wp-image-6360\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-4.jpeg 720w, 
https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-4-300x91.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><strong>\u8bad\u7ec3\u8fc7\u7a0b<\/strong>\uff1a<\/p>\n\n\n\n<p>\u4e3a\u4e86\u51cf\u5c11\u8ba1\u7b97\u6210\u672c\u5e76\u907f\u514d\u707e\u96be\u6027\u9057\u5fd8\uff0cBLIP-2 \u5728\u9884\u8bad\u7ec3\u65f6\u51bb\u7ed3\u9884\u8bad\u7ec3\u56fe\u50cf\u6a21\u578b\u548c\u8bed\u8a00\u6a21\u578b\uff0c\u7531\u4e8e\u7b80\u5355\u5730\u51bb\u7ed3\u9884\u8bad\u7ec3\u6a21\u578b\u53c2\u6570\u4f1a\u5bfc\u81f4\u89c6\u89c9\u7279\u5f81\u548c\u6587\u672c\u7279\u5f81\u96be\u4ee5\u5bf9\u9f50\uff0c\u4e3a\u6b64BLIP-2\u63d0\u51fa\u4e24\u9636\u6bb5\u9884\u8bad\u7ec3 Q-Former \u6765\u5f25\u8865modality gap\uff1a\u8868\u793a\u5b66\u4e60\u9636\u6bb5\u548c\u751f\u6210\u5b66\u4e60\u9636\u6bb5\u3002<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u7b2c\u4e00\u4e2a\u9884\u8bad\u7ec3\u9636\u6bb5\uff0cvision-language<strong>\u8868\u793a\u5b66\u4e60<\/strong>\uff0c\u5c06 Q-Former \u8fde\u63a5\u5230\u51bb\u7ed3\u7684\u56fe\u50cf\u7f16\u7801\u5668image encoder\uff0c\u76ee\u6807\u662fQ-Former\u5b66\u4e60\u4e0e\u6587\u672c\u6700\u76f8\u5173\u7684\u89c6\u89c9\u8868\u793a\u3002\u548cBLIP\u7c7b\u4f3c\uff0c\u901a\u8fc7\u8054\u5408\u4f18\u5316 ITC + ITG + ITM \u4e09\u4e2a\u9884\u8bad\u7ec3loss\uff0c\u5e76\u5728Query\u548cText\u4e4b\u95f4<strong>\u91c7\u7528\u4e0d\u540c\u7684\u6ce8\u610f\u529b\u63a9\u7801\u7b56\u7565\uff0c\u4ece\u800c\u63a7\u5236Image Transformer\u548cText Transformer\u7684\u4ea4\u4e92\u65b9\u5f0f<\/strong>\u3002<\/li>\n<\/ol>\n\n\n\n<ul class=\"wp-block-list\">\n<li>ITC(Image-Text Contrastive Learning)\uff1a\u4f18\u5316\u76ee\u6807\u662f\u5bf9\u9f50\u56fe\u50cf\u7279\u5f81\u548c\u6587\u672c\u7279\u5f81\uff0c\u4e5f\u5c31\u662f\u5bf9\u9f50image transformer\u8f93\u51fa\u7684query representation\u4e0e\u6765\u81eatext transformer\u8f93\u51fa\u7684text 
representation\u3002\u4e3a\u4e86\u907f\u514d\u4fe1\u606f\u6cc4\u6f0f\uff0cITC\u91c7\u7528\u4e86<strong>\u5355\u6a21\u6001\u81ea\u6ce8\u610f\u63a9\u7801<\/strong>\uff0c\u4e0d\u5141\u8bb8query\u548ctext\u770b\u5230\u5bf9\u65b9\u3002\u8ba1\u7b97\u65f6\u5148\u8ba1\u7b97\u6bcf\u4e2aquery\u4e0e\u6587\u672cembedding\u4e4b\u95f4\u7684\u76f8\u4f3c\u5ea6\uff0c\u7136\u540e\u9009\u62e9\u6700\u9ad8\u7684\u4f5c\u4e3a\u56fe\u6587\u76f8\u4f3c\u5ea6\u3002<\/li>\n\n\n\n<li>ITG(Image-grounded Text Generation)\uff1a\u4f18\u5316\u76ee\u6807\u662f\u7ed9\u5b9a\u8f93\u5165\u56fe\u50cf\u4f5c\u4e3a\u6761\u4ef6\uff0c\u8bad\u7ec3 Q-Former \u751f\u6210\u6587\u672c\uff0c\u8feb\u4f7fquery\u63d0\u53d6\u5305\u542b\u6240\u6709\u6587\u672c\u4fe1\u606f\u7684\u89c6\u89c9\u7279\u5f81\u3002\u7531\u4e8e Q-Former \u7684\u67b6\u6784\u4e0d\u5141\u8bb8\u51bb\u7ed3\u7684\u56fe\u50cf\u7f16\u7801\u5668\u548c\u6587\u672c\u6807\u8bb0\u4e4b\u95f4\u7684\u76f4\u63a5\u4ea4\u4e92\uff0c\u56e0\u6b64\u751f\u6210\u6587\u672c\u6240\u9700\u7684\u4fe1\u606f\u5fc5\u987b\u9996\u5148\u7531query\u63d0\u53d6\uff0c\u7136\u540e\u901a\u8fc7\u81ea\u6ce8\u610f\u529b\u5c42\u4f20\u7ed9text token\u3002ITG\u91c7\u7528<strong>\u591a\u6a21\u6001causal attention mask<\/strong>\u6765\u63a7\u5236query\u548ctext\u7684\u4ea4\u4e92\uff0cquery\u53ef\u4ee5\u76f8\u4e92\u611f\u77e5\uff0c\u4f46\u4e0d\u80fd\u770b\u89c1text token\uff0c\u6bcf\u4e2atext token\u90fd\u53ef\u4ee5\u611f\u77e5\u6240\u6709query\u53ca\u5176<strong>\u524d\u9762\u7684text\u6807\u8bb0\u3010\u534a\u77e9\u9635\uff0c\u751f\u6210\u5f0f\u4efb\u52a1\u7684\u5e38\u89c1\u505a\u6cd5\u3011<\/strong>\u3002\u8fd9\u91cc\u5c06 [CLS] \u6807\u8bb0\u66ff\u6362\u4e3a\u65b0\u7684 [DEC] \u6807\u8bb0\uff0c\u4f5c\u4e3a\u7b2c\u4e00\u4e2a\u6587\u672c\u6807\u8bb0\u6765\u6307\u793a\u89e3\u7801\u4efb\u52a1\u3002<\/li>\n\n\n\n<li>ITM( Image-Text 
Matching)\uff1a\u4f18\u5316\u76ee\u6807\u662f\u8fdb\u884c\u56fe\u50cf\u548c\u6587\u672c\u8868\u793a\u4e4b\u95f4\u7684\u7ec6\u7c92\u5ea6\u5bf9\u9f50\uff0c\u5b66\u4e00\u4e2a\u4e8c\u5206\u7c7b\u4efb\u52a1\uff0c\u5373\u56fe\u50cf-\u6587\u672c\u5bf9\u662f\u6b63\u5339\u914d\u8fd8\u662f\u8d1f\u5339\u914d\u3002\u8fd9\u91cc\u5c06image transformer\u8f93\u51fa\u7684\u6bcf\u4e2aquery\u5d4c\u5165\u8f93\u5165\u5230\u4e00\u4e2a\u4e8c\u7c7b\u7ebf\u6027\u5206\u7c7b\u5668\u4e2d\u4ee5\u83b7\u5f97\u5bf9\u5e94\u7684logit\uff0c\u7136\u540e\u5c06\u6240\u6709\u7684logit\u5e73\u5747\uff0c\u518d\u8ba1\u7b97\u5339\u914d\u5206\u6570\u3002ITM\u4f7f\u7528<strong>\u53cc\u5411\u81ea\u6ce8\u610f\u63a9\u7801<\/strong>\uff0c\u6240\u6709query\u548ctext\u90fd\u53ef\u4ee5\u76f8\u4e92\u611f\u77e5\u3002<\/li>\n<\/ul>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u7b2c\u4e8c\u4e2a\u9884\u8bad\u7ec3\u9636\u6bb5\uff0cvision-to-language<strong>\u751f\u6210\u5b66\u4e60<\/strong>\uff0c\u5c06 Q-Former \u8fde\u63a5\u5230\u51bb\u7ed3\u7684\u5927\u8bed\u8a00\u6a21\u578bLLM\uff0c\u5c06 Q-Former \u7684\u8f93\u51fa\u7ed9\u5230\u51bb\u7ed3\u7684 LLM \u6765\u6267\u884c\u89c6\u89c9\u5230\u8bed\u8a00\u7684\u751f\u6210\u5b66\u4e60\uff0c\u76ee\u6807\u662f\u8bad\u7ec3Q-Former\u4f7f\u5176\u8f93\u51fa\u7684\u89c6\u89c9\u8868\u793a\u5bf9LLM\u53ef\u7528\u3002<\/li>\n<\/ul>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u4f7f\u7528<strong>\u5168\u8fde\u63a5\u5c42<\/strong>\u5c06\u8f93\u51fa\u7684query embedding\u7ebf\u6027\u6295\u5f71\u5230\u4e0e LLM \u7684text embedding\u76f8\u540c\u7684\u7ef4\u5ea6\uff0c\u7136\u540e\u5c06\u6295\u5f71\u7684query embedding\u6dfb\u52a0\u5230\u8f93\u5165text embedding\u524d\u9762\u3002\u7531\u4e8e Q-Former 
\u5df2\u7ecf\u8fc7\u9884\u8bad\u7ec3\uff0c\u53ef\u4ee5\u63d0\u53d6\u5305\u542b\u8bed\u8a00\u4fe1\u606f\u7684\u89c6\u89c9\u8868\u793a\uff0c\u56e0\u6b64\u5b83\u53ef\u4ee5\u6709\u6548\u5730\u5145\u5f53<strong>\u4fe1\u606f\u67a2\u7ebd<\/strong>\uff0c\u5c06\u6700\u6709\u7528\u7684\u4fe1\u606f\u63d0\u4f9b\u7ed9 LLM\uff0c\u540c\u65f6\u5220\u9664\u4e0d\u76f8\u5173\u7684\u89c6\u89c9\u4fe1\u606f\uff0c\u51cf\u8f7b\u4e86 LLM \u5b66\u4e60\u89c6\u89c9\u8bed\u8a00\u5bf9\u9f50\u7684\u8d1f\u62c5\u3010\u76f8\u5f53\u4e8esoft visual prompts\u3011\u3002<\/li>\n\n\n\n<li>\u5c1d\u8bd5\u4e86decoder-based LLM \u548c encoder-decoder-based LLM\uff1a\u5bf9\u4e8edecoder-based LLM\uff0c\u57fa\u4e8elanguage modeling loss\u8fdb\u884c\u9884\u8bad\u7ec3\uff0c\u7528Q-Former\u63d0\u53d6\u7684\u89c6\u89c9\u8868\u793a\u751f\u6210\u6587\u672c\u63cf\u8ff0\uff1b\u5bf9\u4e8eencoder-decoder-based LLM\uff0c\u57fa\u4e8eprefix language modeling loss\u8fdb\u884c\u9884\u8bad\u7ec3\uff0c\u628a\u524d\u7f00\u548c\u89c6\u89c9\u8868\u793a\u4e00\u8d77\u8f93\u5165LLM encoder\uff0c\u7531LLM decoder\u751f\u6210\u540e\u7eed\u6587\u672c\u3002<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"254\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-2.jpeg\" alt=\"\" class=\"wp-image-6358\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-2.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-2-300x106.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" 
\/><\/figure>\n\n\n\n<p><br><strong>VQA\u4efb\u52a1\u5fae\u8c03\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<p>\u628a\u95ee\u9898\u548cQ-Former\u63d0\u53d6\u7684\u89c6\u89c9\u8868\u793a\u4e00\u8d77\u8f93\u5165LLM\u4ece\u800c\u5f97\u5230\u7b54\u6848\uff0c\u4e0d\u8fc7\u8fd8\u591a\u4e86\u4e00\u4e2a\u628a\u95ee\u9898\u4e5f\u52a0\u5165\u4e86Q-Former\u7684\u8f93\u5165\uff0c\u4f7f\u63d0\u53d6\u7684\u56fe\u50cf\u7279\u5f81\u548c\u95ee\u9898\u66f4\u76f8\u5173\u3002\u8fd9\u79cd\u65b9\u5f0f\u4e5f\u7528\u5728\u4e86\u4ed6\u4eec\u7684\u4e0b\u4e00\u7bc7\u6587\u7ae0InstructBLIP\u4e2d\uff0c\u7528\u4e8e\u6ce8\u5165\u6307\u4ee4\u8fdb\u884c\u5fae\u8c03\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"592\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-22.jpeg\" alt=\"\" class=\"wp-image-6379\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-22.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-22-300x247.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u6570\u636e\u96c6<\/strong>\uff1a<br>\u9884\u8bad\u7ec3\u6570\u636e\u96c6\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u6cbf\u7528BLIP\u7684\u6570\u636e\u96c6\uff0c\u52a0\u8d77\u6765\u603b\u5171129M\uff1aCOCO, Visual Genome, Conceptual Captions 3M+12M, SBU Captions, \u989d\u5916\u7684 web \u6570\u636e\u96c6 LAION400M \u7684\u4e00\u90e8\u5206\uff0c\u8be5\u6570\u636e\u96c6\u5305\u542b 115M \u56fe\u50cf\uff0c\u5177\u6709\u66f4\u591a\u7684\u566a\u58f0\u6587\u672c\uff1b<\/li>\n\n\n\n<li>\u91c7\u7528\u4e86 BLIP \u91cc\u9762\u63d0\u51fa\u7684 CapFilt \u65b9\u6cd5\u4ece\u7f51\u7edc\u56fe\u50cf\u5408\u6210\u6587\u672c\u63cf\u8ff0\uff08\u9009top2)\u3002<\/li>\n<\/ol>\n\n\n\n<p>\u9884\u8bad\u7ec3\u597d\u7684Image Encoder\u548c LLM\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u89c6\u89c9\u6a21\u578b\uff1aCLIP\u8bad\u7ec3\u7684 
ViT-L\/14\uff1bEVA-CLIP\u8bad\u7ec3\u7684 ViT-g\/14<\/li>\n\n\n\n<li>LLM \uff1a\u57fa\u4e8e\u89e3\u7801\u5668\u7684LLM-OPT\uff1b\u57fa\u4e8e\u7f16\u89e3\u7801\u5668\u7684LLM-FlanT5<\/li>\n<\/ol>\n\n\n\n<p><strong>\u8bad\u7ec3\u8bbe\u5b9a<\/strong>\uff1a\u7b2c\u4e00\u9636\u6bb5\u9884\u8bad\u7ec3250k\u6b65\uff0c\u7b2c\u4e8c\u9636\u6bb5\u9884\u8bad\u7ec380k\u6b65\uff0c\u56fe\u50cf\u8f93\u5165\u5c3a\u5bf8224*224\uff0c\u66f4\u591a\u7ec6\u8282\u770b\u539f\u6587\u3002<\/p>\n\n\n\n<p><strong>\u6548\u679c\u5c55\u793a<\/strong>\uff1a\u53ef\u505a\u5230\u89c6\u89c9\u77e5\u8bc6\u63a8\u7406\u3001\u89c6\u89c9\u5e38\u8bc6\u63a8\u7406\u3001\u89c6\u89c9\u5bf9\u8bdd\u3001\u4e2a\u6027\u5316\u56fe\u50cf\u5230\u6587\u672c\u751f\u6210\u7b49\uff0c\u53ea\u9700\u5728\u89c6\u89c9\u63d0\u793a\u4e4b\u540e\u9644\u52a0\u6587\u672c\u63d0\u793a\u4f5c\u4e3a LLM \u7684\u8f93\u5165\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"732\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-17.jpeg\" alt=\"\" class=\"wp-image-6374\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-17.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-17-295x300.jpeg 295w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u4e0d\u8db3<\/strong>\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u4e0a\u4e0b\u6587\u5b66\u4e60\u80fd\u529b\u7f3a\u5931\uff1a\u7531\u4e8e\u9884\u8bad\u7ec3\u6570\u636e\u96c6\u4e2d\u7684\u6bcf\u4e2a\u6570\u636e\u53ea\u5305\u542b\u4e00\u4e2a\u56fe\u6587\u5bf9\uff0cLLM\u65e0\u6cd5\u5b66\u4e60\u5355\u4e2a\u5e8f\u5217\u4e2d\u591a\u4e2a\u56fe\u6587\u5bf9\u7684\u76f8\u5173\u6027\u3002<\/li>\n\n\n\n<li>\u51bb\u7ed3\u53c2\u6570\u7684 LLM 
\u7684\u98ce\u9669\uff1a\u6bd4\u5982\u8f93\u51fa\u653b\u51fb\u6027\u8bed\u8a00\uff0c\u4f20\u64ad\u793e\u4f1a\u504f\u89c1\uff0c\u89e3\u51b3\u529e\u6cd5\u662f\u6307\u4ee4\u5fae\u8c03\uff0c\u6216\u8005\u8fc7\u6ee4\u6389\u6709\u5bb3\u7684\u6570\u636e\u96c6\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">LLaVA\uff08Large Language and Vision Assistant\uff09<\/h2>\n\n\n\n<p><br><strong>\u57fa\u672c\u601d\u60f3<\/strong>\uff1a<\/p>\n\n\n\n<p>\u4f7f\u7528\u4ec5\u9650\u8bed\u8a00\u7684GPT-4\u751f\u6210\u591a\u6a21\u6001\u8bed\u8a00\u56fe\u50cf\u6307\u4ee4\u8ddf\u968f\u6570\u636e\uff0c\u63d0\u51fa\u4e00\u79cd\u8fde\u63a5\u89c6\u89c9\u7f16\u7801\u5668\u548cLLM\u7684\u7aef\u5230\u7aef\u8bad\u7ec3\u591a\u6a21\u6001\u5927\u6a21\u578b\u3002<\/p>\n\n\n\n<p><br><strong>\u6a21\u578b\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<p>\u4f7f\u7528\u89c6\u89c9\u7f16\u7801\u5668CLIP ViT-L\/14+\u8bed\u8a00\u89e3\u7801\u5668LLaMA\u6784\u6210\u591a\u6a21\u6001\u5927\u6a21\u578b\uff0c\u7136\u540e\u4f7f\u7528\u751f\u6210\u7684\u6570\u636e\u8fdb\u884c\u6307\u4ee4\u5fae\u8c03\u3002\u8f93\u5165\u56fe\u7247X\u7ecf\u8fc7\u9884\u8bad\u7ec3\u597d\u7684\u89c6\u89c9\u7f16\u7801\u5668\u5f97\u5230\u56fe\u7247\u7279\u5f81Z\uff0c\u56fe\u7247\u7279\u5f81Z\u7ecf\u8fc7\u4e00\u4e2a\u6620\u5c04\u77e9\u9635W\u8f6c\u5316\u4e3a\u89c6\u89c9Token H\uff0c\u8fd9\u6837Vision Token H_v\u4e0eLanguage Token H_q\u6307\u4ee4\u5c31\u90fd\u5728\u540c\u4e00\u4e2a\u7279\u5f81\u7a7a\u95f4\uff0c\u62fc\u63a5\u540e\u4e00\u8d77\u8f93\u5165\u5927\u6a21\u578b\u3002\u8fd9\u91cc\u7684\u6620\u5c04\u5c42W\u4e5f\u53ef\u4ee5\u66ff\u6362\u4e3a\u66f4\u590d\u6742\u7684\u7f51\u7edc\u6765\u63d0\u5347\u6027\u80fd\uff0c\u6bd4\u5982Flamingo\u4e2d\u7528\u7684gated cross-attention\uff0cBLIP-2\u4e2d\u7528\u7684Q-former\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"734\" height=\"102\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1.png\" alt=\"\" class=\"wp-image-6369\" 
srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1.png 734w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1-300x42.png 300w\" sizes=\"auto, (max-width: 734px) 100vw, 734px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"284\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-5.jpeg\" alt=\"\" class=\"wp-image-6361\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-5.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-5-300x118.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u8bad\u7ec3<\/strong>\uff1a<\/p>\n\n\n\n<p>\u4f7f\u7528\u5982\u4e0b\u56fe\u7684\u65b9\u5f0f\u7ec4\u7ec7\u6bcf\u4e00\u8f6e\u7684\u5bf9\u8bdd\u8f93\u5165\u8f93\u51fa\uff0c\u8bad\u7ec3\u6a21\u578b\u9884\u6d4b\u52a9\u624b\u7684\u7b54\u6848\u4ee5\u53ca\u5728\u54ea\u91cc\u505c\u6b62\uff0c\u56e0\u6b64\u4ec5\u4f7f\u7528\u7eff\u8272\u5e8f\u5217\u548c\u6807\u8bb0\u6765\u8ba1\u7b97<strong>\u81ea\u56de\u5f52\u6a21\u578b<\/strong>\u4e2d\u7684\u635f\u5931\uff0c\u5373\u6839\u636e\u6240\u6709\u524d\u8f6e\u7684\u6307\u4ee4\u548c\u56de\u7b54\u6765\u9884\u6d4b\u5f53\u524d\u76ee\u6807\u56de\u7b54X_a\uff0c\u4e5f\u5c31\u662f\u7ecf\u5178\u7684next token prediction\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"1296\" height=\"192\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image.png\" alt=\"\" class=\"wp-image-6356\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image.png 1296w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-300x44.png 300w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-1024x152.png 1024w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-768x114.png 768w\" sizes=\"auto, (max-width: 1296px) 100vw, 1296px\" 
\/><figcaption class=\"wp-element-caption\">\u81ea\u56de\u5f52\u635f\u5931-\u901a\u8fc7\u6240\u6709\u524d\u8f6e\u7684\u6307\u4ee4\u548c\u56de\u7b54\u6765\u9884\u6d4b\u5f53\u524d\u76ee\u6807\u56de\u7b54X_a<\/figcaption><\/figure>\n\n\n\n<p><br>\u6587\u4e2d\u4f7f\u7528\u4e86\u4e24\u9636\u6bb5\u7684\u8bad\u7ec3\u65b9\u5f0f\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u9884\u8bad\u7ec3\u7279\u5f81\u5bf9\u9f50\u6a21\u5757\uff08\u6620\u5c04\u5c42W\uff09\uff0c\u51bb\u7ed3\u89c6\u89c9\u7f16\u7801\u5668\u548cLLM\uff0c\u53ea\u8bad\u7ec3<strong>\u6620\u5c04\u77e9\u9635W\u5f97\u5230\u4e0a\u9762\u516c\u5f0f\u7684\u6700\u5927\u4f3c\u7136<\/strong>\uff0c\u76f8\u5f53\u4e8e\u4e3a\u51bb\u7ed3\u7684 LLM \u8bad\u7ec3\u4e00\u4e2a\u9002\u914d\u7684visual tokenizer\u3002<\/li>\n\n\n\n<li>\u7aef\u5230\u7aef\u7684\u5fae\u8c03\u8bed\u8a00\u6a21\u578b+\u6620\u5c04\u5c42\u3002<\/li>\n<\/ol>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"246\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-18.jpeg\" alt=\"\" class=\"wp-image-6375\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-18.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-18-300x103.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u6570\u636e<\/strong>\uff1a<\/p>\n\n\n\n<p>\u4f7f\u7528ChatGPT\/GPT-4\u5c06\u6570\u636e\u8f6c\u5316\u4e3a\u591a\u6a21\u6001\u6307\u4ee4\u8ddf\u968f\u6570\u636e\uff08multimodal instruction-following data\uff09\u3002\u5177\u4f53\u6765\u8bf4\uff0c\u4e3a\u4e86\u5c06\u8f93\u5165\u56fe\u50cf\u7f16\u7801\u4e3a\u89c6\u89c9\u7279\u5f81\u6765\u4f5c\u4e3a\u7eaf\u6587\u672c GPT\u7684(soft) prompt\uff0c\u6587\u4e2d\u7528\u4e86\u4e24\u79cd\u7c7b\u578b\u7684\u8868\u8fbe\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>caption\u63cf\u8ff0\uff1a\u4ece\u4e0d\u540c\u89d2\u5ea6\u63cf\u8ff0\u89c6\u89c9\u573a\u666f\uff1b<\/li>\n\n\n\n<li>bbox\u68c0\u6d4b\u6846\uff1a\u5b9a\u4f4d\u573a\u666f\u4e2d\u7684\u5bf9\u8c61\uff0c\u6bcf\u4e2a\u6846\u5bf9\u5bf9\u8c61\u6982\u5ff5\u53ca\u5176\u7a7a\u95f4\u4f4d\u7f6e\u8fdb\u884c\u7f16\u7801\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u901a\u8fc7\u8fd9\u4e24\u7c7b\u7b26\u53f7\u8868\u793a\uff0c\u5c06\u89c6\u89c9\u5185\u5bb9\u4f20\u8fbe\u7ed9\u4e86\u8bed\u8a00\u5927\u6a21\u578b\uff0c\u7136\u540e\u4eba\u5de5\u8bbe\u8ba1\u4e863\u79cd\u5bf9\u8bdd\u65b9\u5f0f\uff0c\u5229\u7528GPT-4\u8fdb\u884c\u751f\u6210\u548c\u6269\u5145\uff0c\u5206\u522b\u662f\u5bf9\u8bdd\u3001\u7ec6\u8282\u63cf\u8ff0\u548c\u590d\u6742\u63a8\u7406\u3002\u6700\u540e\u603b\u5171\u6536\u96c6\u4e86158K\u72ec\u7279\u7684\u8bed\u8a00\u56fe\u50cf\u6307\u4ee4\u8ddf\u968f\u6837\u672c\uff0c\u5305\u62ec58K\u5bf9\u8bdd\u6570\u636e\u300123K\u8be6\u7ec6\u63cf\u8ff0\u6570\u636e\u300177K\u590d\u6742\u63a8\u7406\u6570\u636e\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"780\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-11.jpeg\" alt=\"\" class=\"wp-image-6367\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-11.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-11-277x300.jpeg 277w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\"><br>MiniGPT-4<\/h2>\n\n\n\n<p><br><strong>\u57fa\u672c\u601d\u60f3<\/strong>\uff1a<\/p>\n\n\n\n<p>GPT-4 
\u5177\u6709\u5148\u8fdb\u7684\u591a\u6a21\u6001\u751f\u6210\u80fd\u529b\u7684\u4e3b\u8981\u539f\u56e0\u5728\u4e8e\u5229\u7528\u4e86\u66f4\u5148\u8fdb\u7684\u5927\u578b\u8bed\u8a00\u6a21\u578b\uff08LLM\uff09\uff0c\u56e0\u6b64\u63d0\u51fa\u4ec5\u7528\u4e00\u4e2a\u6295\u5f71\u5c42\u5c06\u4e00\u4e2a\u51bb\u7ed3\u7684\u89c6\u89c9\u7f16\u7801\u5668\u548c\u4e00\u4e2a\u51bb\u7ed3\u7684 LLM\uff08Vicuna\uff09\u5bf9\u9f50\u3002<\/p>\n\n\n\n<p><br><strong>\u6a21\u578b\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<p>\u7c7b\u4f3cBLIP2\uff0c\u5305\u62ec\u4e00\u4e2a\u51bb\u7ed3\u7684\u89c6\u89c9\u7f16\u7801\u5668\uff08ViT-G\/14 + Q-Former\uff09\uff0c \u4e00\u4e2a\u51bb\u7ed3\u7684 LLM\uff08Vicuna\uff09\uff0c \u4e00\u4e2a\u6295\u5f71\u5c42\u3002<\/p>\n\n\n\n<p><strong>\u4e24\u9636\u6bb5\u8bad\u7ec3<\/strong>\uff1a<\/p>\n\n\n\n<p>\u7b2c\u4e00\u9636\u6bb5\u5728\u5927\u91cf\u5bf9\u9f50\u7684\u56fe\u50cf\u6587\u672c\u5bf9\u4e0a\u5bf9\u6a21\u578b\u8fdb\u884c\u9884\u8bad\u7ec3\uff0c\u4ee5\u83b7\u53d6\u57fa\u7840\u7684\u89c6\u89c9\u8bed\u8a00\u77e5\u8bc6\u3002 \u5728\u7b2c\u4e8c\u9636\u6bb5\uff0c\u4f7f\u7528\u89c4\u6a21\u8f83\u5c0f\u4f46\u66f4\u9ad8\u8d28\u91cf\u7684\u56fe\u6587\u5bf9\u6570\u636e\u96c6\u548c\u7cbe\u5fc3\u8bbe\u8ba1\u7684\u5bf9\u8bdd\u6a21\u677f\u5bf9\u9884\u8bad\u7ec3\u6a21\u578b\u8fdb\u884c\u5fae\u8c03\uff0c\u4ee5\u589e\u5f3a\u6a21\u578b\u7684\u751f\u6210\u53ef\u9760\u6027\u548c\u53ef\u7528\u6027\u3002\u5173\u4e8eloss\u8bbe\u8ba1\u8bba\u6587\u6ca1\u6709\u8bb2\u592a\u7ec6\uff0c\u770b\u4ee3\u7801\u91cc\u9762\u5e94\u8be5\u4e3b\u8981\u662f\u8ba1\u7b97language modeling loss\uff1a <a href=\"https:\/\/link.zhihu.com\/?target=https%3A\/\/github.com\/Vision-CAIR\/MiniGPT-4\/blob\/main\/minigpt4\/models\/mini_gpt4.py%23L320\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/github.com\/Vision-CAIR\/MiniGPT-4\/blob\/main\/minigpt4\/models\/mini_gpt4.py#L320<\/a><\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"537\" 
src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-7.jpeg\" alt=\"\" class=\"wp-image-6364\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-7.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-7-300x224.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">InstructBLIP(Instruction Tuning)<\/h2>\n\n\n\n<p><br><strong>\u57fa\u672c\u601d\u60f3<\/strong>\uff1a<\/p>\n\n\n\n<p>\u57fa\u4e8e BLIP-2 \u63d0\u51fa\u6307\u4ee4\u5fae\u8c03\u7684\u8303\u5f0f\uff0c\u501f\u52a9\u989d\u5916\u7684 instruction \u63d0\u53d6\u66f4\u6709\u7528\u7684\u89c6\u89c9\u7279\u5f81\u3002<\/p>\n\n\n\n<p><strong>\u6a21\u578b\u7ed3\u6784<\/strong>\uff1a<\/p>\n\n\n\n<p>InstructBLIP \u7684\u67b6\u6784\u548c BLIP-2 \u76f8\u4f3c\uff0c\u4ece\u9884\u8bad\u7ec3\u597d\u7684 BLIP-2 \u6a21\u578b\u521d\u59cb\u5316\uff0c\u7531\u56fe\u50cf\u7f16\u7801\u5668\u3001LLM \u548c Q-Former \u7ec4\u6210\u3002\u4e3a\u4e86\u8fdb\u884c\u6307\u4ee4\u5fae\u8c03\uff0c\u5728BLIP-2\u7684\u57fa\u7840\u4e0a\u628a instruction text tokens\u4e5f\u4f5c\u4e3a\u8f93\u5165\u540c\u65f6\u7ed9\u5230Q-former\u548cLLM\u3002\u5176\u4e2d\u53ef\u5b66\u4e60\u7684K\u4e2aqueries \u901a\u8fc7Q-former\u4e2d\u5171\u4eab\u7684 self-attention \u548c\u8f93\u5165\u6307\u4ee4\u4ea4\u4e92\uff0c\u901a\u8fc7 cross-attention \u548c\u8f93\u5165\u56fe\u7247\u7684\u7279\u5f81\u4ea4\u4e92\uff0c\u9f13\u52b1\u63d0\u53d6\u4e0e\u4efb\u52a1\u76f8\u5173\u7684\u56fe\u50cf\u7279\u5f81\u3002<\/p>\n\n\n\n<p><br><strong>\u8bad\u7ec3<\/strong>\uff1a\u548cBLIP-2\u4e00\u81f4\uff0c\u5206\u4e24\u4e2a\u9636\u6bb5<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u7b2c\u4e00\u4e2avision-language\u8868\u793a\u5b66\u4e60\u9636\u6bb5\uff0c\u5c06 Q-Former \u8fde\u63a5\u5230\u51bb\u7ed3\u7684\u56fe\u50cf\u7f16\u7801\u5668image 
encoder\uff0c\u76ee\u6807\u662fQ-Former\u5b66\u4e60\u4e0e\u6587\u672c\u6700\u76f8\u5173\u7684\u89c6\u89c9\u8868\u793a\u3002<\/li>\n\n\n\n<li>\u7b2c\u4e8c\u4e2avision-to-language\u751f\u6210\u5b66\u4e60\u9636\u6bb5\uff0c\u5c06 Q-Former \u8fde\u63a5\u5230\u51bb\u7ed3\u7684\u5927\u8bed\u8a00\u6a21\u578bLLM\uff0c\u5c06 Q-Former \u7684\u8f93\u51fa\u7ed9\u5230\u51bb\u7ed3\u7684 LLM \u6765\u6267\u884c\u89c6\u89c9\u5230\u8bed\u8a00\u7684\u751f\u6210\u5b66\u4e60\uff0c\u76ee\u6807\u662f\u8bad\u7ec3Q-Former\u4f7f\u5176\u8f93\u51fa\u7684\u89c6\u89c9\u8868\u793a\u5bf9LLM\u53ef\u7528\u3002<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"407\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-15.jpeg\" alt=\"\" class=\"wp-image-6372\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-15.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-15-300x170.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u63a8\u7406<\/strong>\uff1a<\/p>\n\n\n\n<p>\u5206\u4e24\u79cd\u60c5\u51b5\uff0c\u5bf9\u4e8e\u5927\u90e8\u5206\u63cf\u8ff0\u6027\u4efb\u52a1\uff0c\u5982 image captioning\uff0copen-ended VQA \u7b49\uff0cInstructBLIP \u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528 LLM \u751f\u6210\u7684\u6587\u672c\u4f5c\u4e3a\u8f93\u51fa\uff1b\u5bf9\u4e8e\u9009\u62e9\u6027\u4efb\u52a1\uff0c\u5982 classification \u548c multi-choice VQA \uff0c\u53c2\u8003vocabulary ranking 
method\uff0c\u5c06LLM\u751f\u6210\u7684\u5185\u5bb9\u8bcd\u6c47\u9650\u5236\u4e3a<strong>\u5019\u9009\u5217\u8868\u8fdb\u884c\u6392\u5e8f<\/strong>\uff0c\u8ba1\u7b97\u6bcf\u4e2a\u5019\u9009\u7684\u5bf9\u6570\u4f3c\u7136\uff0c\u9009\u62e9\u6700\u9ad8\u503c\u7684\u4e00\u4e2a\u4f5c\u4e3a\u6700\u7ec8\u9884\u6d4b\u3002<\/p>\n\n\n\n<p><br><strong>\u6570\u636e\u96c6<\/strong>\uff1a<\/p>\n\n\n\n<p>\u4f5c\u8005\u5c0611\u7c7b\u4efb\u52a1\u768426\u4e2a\u6570\u636e\u96c6\u8f6c\u5316\u6210\u6307\u4ee4\u5fae\u8c03\u7684\u683c\u5f0f\uff0c\u628a\u5b83\u4eec\u5206\u621013\u4e2a held-in \u6570\u636e\u96c6\u7528\u4e8e\u6307\u4ee4\u5fae\u8c03\uff0c\u548c13\u4e2a held-out \u6570\u636e\u96c6\u7528\u4e8e Zero-Shot \u80fd\u529b\u7684\u8bc4\u4f30\u3002held-out\u5206\u4e24\u79cd\uff0c\u4e00\u79cd\u662f\u8fd9\u4e9b\u6570\u636e\u5728\u8bad\u7ec3\u671f\u95f4\u6ca1\u88ab\u6a21\u578b\u89c1\u8fc7\uff0c\u4f46\u662f\u540c\u7c7b\u578b\u7684\u4efb\u52a1\u4e2d\u5b66\u8fc7\uff0c\u53e6\u4e00\u79cd\u662f\u5728\u8bad\u7ec3\u548c\u540c\u7c7b\u578b\u7684\u4efb\u52a1\u90fd\u4e0d\u6d89\u53ca\uff08\u6bd4\u5982\u4e0b\u56fe\u4e2d\u5168\u4e3a\u767d\u6846\u76844\u4e2a\u4efb\u52a1\uff09\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"450\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-23.jpeg\" alt=\"\" class=\"wp-image-6380\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-23.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-23-300x188.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u91c7\u6837\u7b56\u7565<\/strong>\uff1a<\/p>\n\n\n\n<p>\u5728\u8bad\u7ec3\u671f\u95f4\uff0c\u6240\u6709\u7684 held-in 
\u6570\u636e\u96c6\u7684\u8bad\u7ec3\u96c6\u6df7\u5408\uff0c\u603b\u6570\u91cf\u592a\u5927\u4e14\u6bcf\u4e2a\u6570\u636e\u96c6\u7684\u89c4\u6a21\u5b58\u5728\u663e\u7740\u5dee\u5f02\uff0c\u5747\u5300\u6df7\u5408\u4f1a\u5bfc\u81f4\u6a21\u578b\u8fc7\u62df\u5408\u8f83\u5c0f\u7684\u6570\u636e\u96c6\uff0c\u6b20\u62df\u5408\u8f83\u5927\u7684\u6570\u636e\u96c6\u3002\u56e0\u6b64\uff0c\u6587\u4e2d\u63d0\u51fa\u6839\u636e<strong>\u6570\u636e\u96c6\u7684\u5927\u5c0f\u7684\u5e73\u65b9\u6839<\/strong>\u6216\u8005\u5176\u4ed6\u6b63\u76f8\u5173\u6bd4\u4f8b\u8fdb\u884c\u91c7\u6837\u3002\u66f4\u591a\u8bad\u7ec3\u7ec6\u8282\u89c1\u539f\u6587\u3002<\/p>\n\n\n\n<p><br><strong>\u6307\u4ee4\u6a21\u677f<\/strong>\uff1a<\/p>\n\n\n\n<p>\u5bf9\u4e8e\u6bcf\u4e2a\u4efb\u52a1\uff0c\u4eba\u5de5\u5236\u4f5c\u4e8610-15\u4e2a\u9ad8\u8d28\u91cf\u7684\u81ea\u7136\u8bed\u8a00\u6307\u4ee4\u6a21\u677f\u3002\u5bf9\u4e8e\u4e00\u4e9b\u504f\u7231\u7b80\u6d01\u54cd\u5e94\u7684\u6570\u636e\u96c6\uff0c\u5728 instruction \u4e2d\u589e\u52a0\u4e86 &#8220;short&#8221;, &#8220;briefly&#8221; \u6765\u51cf\u5c0f\u6a21\u578b\u8fc7\u62df\u5408\u7684\u98ce\u9669\uff0c\u9632\u6b62\u6a21\u578b\u59cb\u7ec8\u751f\u6210\u5f88\u77ed\u7684\u8f93\u51fa\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"577\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-19.jpeg\" alt=\"\" class=\"wp-image-6376\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-19.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-19-300x240.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">MME\u591a\u6a21\u6001\u5927\u6a21\u578b\u8bc4\u4f30\u4f53\u7cfb<\/h2>\n\n\n\n<p><br>\u94fe\u63a5\uff1a <a href=\"https:\/\/link.zhihu.com\/?target=https%3A\/\/github.com\/BradyFU\/Awesome-Multimodal-Large-Language-Models\/tree\/Evaluation\" target=\"_blank\" rel=\"noreferrer 
noopener\">A Comprehensive Evaluation Benchmark for Multimodal Large Language Models<\/a><\/p>\n\n\n\n<p><br><strong>\u57fa\u672c\u601d\u8def<\/strong>\uff1a<\/p>\n\n\n\n<p><br>\u9488\u5bf9\u591a\u6a21\u6001\u5927\u6a21\u578b\u63d0\u51fa\u8bc4\u4f30\u57fa\u51c6MME\uff0c\u572814\u4e2a\u5b50\u4efb\u52a1\u4e0a\u8bc4\u4f30\u4e86\u76ee\u524d\u6bd4\u8f83\u524d\u6cbf\u768412\u4e2a\u5927\u6a21\u578b\u7684\u611f\u77e5\u548c\u8ba4\u77e5\u80fd\u529b\u3002<\/p>\n\n\n\n<p><strong>\u901a\u7528\u7684\u5168\u9762\u8bc4\u4f30\u57fa\u51c6\u5e94\u5177\u5907\u4ee5\u4e0b\u56db\u4e2a\u7279\u70b9<\/strong>\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u6db5\u76d6\u4e0d\u540c\u5c42\u6b21\u7684\u80fd\u529b\uff0c\u5305\u62ec\u611f\u77e5\u548c\u8ba4\u77e5\u80fd\u529b\u3002\u524d\u8005\u6307\u8bc6\u522b\u7279\u5b9a\u5bf9\u8c61\uff0c\u5982\u5bf9\u8c61\u7684\u5b58\u5728\u3001\u6570\u91cf\u3001\u4f4d\u7f6e\u548c\u989c\u8272\u3002\u540e\u8005\u6307\u901a\u8fc7\u7ec4\u5408LLM\u4e2d\u7684\u611f\u77e5\u4fe1\u606f\u548c\u77e5\u8bc6\u63a8\u5bfc\u51fa\u66f4\u590d\u6742\u7684\u7b54\u6848\uff08\u524d\u8005\u662f\u540e\u8005\u7684\u524d\u63d0\uff09\u3002<\/li>\n\n\n\n<li>\u6570\u636e\u6216\u6ce8\u91ca\u5e94\u5c3d\u53ef\u80fd\u4e0d\u6765\u81ea\u73b0\u6709\u7684\u516c\u5f00\u53ef\u7528\u6570\u636e\u96c6\uff0c\u4ee5\u907f\u514d\u6570\u636e\u6cc4\u6f0f\u7684\u98ce\u9669\u3002<\/li>\n\n\n\n<li>\u6307\u4ee4\u5e94\u5c3d\u53ef\u80fd\u7b80\u660e\u627c\u8981\uff0c\u5e76\u7b26\u5408\u4eba\u7c7b\u7684\u8ba4\u77e5\u3002\u6240\u6709\u6a21\u578b\u5e94\u5728\u76f8\u540c\u7edf\u4e00\u7684\u6307\u4ee4\u4e0b\u8fdb\u884c\u6d4b\u8bd5\uff0c\u4ee5\u8fdb\u884c\u516c\u5e73\u6bd4\u8f83\uff0c\u4e00\u4e2a\u4f18\u79c0\u7684MLLM\u4e5f\u5e94\u80fd\u591f\u63a8\u5e7f\u5230\u8fd9\u79cd\u7b80\u660e\u627c\u8981\u7684\u6307\u4ee4\u3002<\/li>\n\n\n\n<li>MLLM\u5bf9\u6307\u4ee4\u7684\u54cd\u5e94\u5e94\u76f4\u89c2\u4e14\u4fbf\u4e8e\u5b9a\u91cf\u5206\u6790\u3002<\/li>\n<\/ol>\n\n\n\n<p><strong>\u5bf9\u5e94\u7684\u8bbe\u8ba1<\/strong>\uff1a<
\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>MME\u6db5\u76d6\u4e86\u611f\u77e5\u548c\u8ba4\u77e5\u80fd\u529b\u7684\u8003\u5bdf\uff0c\u603b\u5171\u670914\u4e2a\u5b50\u4efb\u52a1\uff1a\n<ol class=\"wp-block-list\">\n<li>\u611f\u77e5\u80fd\u529b\uff1aOCR\u80fd\u529b\u548c\u7c97\u7c92\u5ea6\u8bc6\u522b\u3001\u7ec6\u7c92\u5ea6\u8bc6\u522b\uff0c\u524d\u8005\u8bc6\u522b\u5bf9\u8c61\u7684\u5b58\u5728\u3001\u6570\u91cf\u3001\u4f4d\u7f6e\u548c\u989c\u8272\uff0c\u540e\u8005\u8bc6\u522b\u7535\u5f71\u6d77\u62a5\u3001\u540d\u4eba\u3001\u573a\u666f\u3001\u5730\u6807\u548c\u827a\u672f\u4f5c\u54c1\u3002<\/li>\n\n\n\n<li>\u8ba4\u77e5\u80fd\u529b\uff1a\u5305\u62ec\u5e38\u8bc6\u63a8\u7406\u3001\u6570\u503c\u8ba1\u7b97\u3001\u6587\u672c\u7ffb\u8bd1\u548c\u4ee3\u7801\u63a8\u7406\u3002<\/li>\n<\/ol>\n<\/li>\n\n\n\n<li>\u6240\u6709\u7684\u6307\u4ee4-\u7b54\u6848\u5bf9\u90fd\u662f\u4eba\u5de5\u6784\u5efa\u7684\u3002\u5bf9\u4e8e\u6d89\u53ca\u5230\u7684\u5c11\u6570\u516c\u5171\u6570\u636e\u96c6\uff0c\u4e5f\u4ec5\u4f7f\u7528\u56fe\u50cf\uff0c\u800c\u4e0d\u76f4\u63a5\u4f9d\u8d56\u4e8e\u5176\u539f\u59cb\u6ce8\u91ca\u3002\u540c\u65f6\uff0c\u6211\u4eec\u52aa\u529b\u901a\u8fc7\u771f\u5b9e\u7167\u7247\u548c\u56fe\u50cf\u751f\u6210\u6765\u6536\u96c6\u6570\u636e\u3002<\/li>\n\n\n\n<li>MME\u7684\u6307\u4ee4\u8bbe\u8ba1\u7b80\u660e\u627c\u8981\uff0c\u4ee5\u907f\u514d\u5bf9\u6a21\u578b\u8f93\u51fa\u7684\u5f71\u54cd\uff0c\u57fa\u672c\u5047\u8bbe\u662f\u4e00\u4e2a\u597d\u7684MLLM\u5e94\u80fd\u591f\u9002\u914d\u7b80\u5355\u4e14\u5e38\u7528\u7684\u6307\u4ee4\u3002<\/li>\n\n\n\n<li>\u6839\u636eMLLM\u7684\u8f93\u51fa\u51c6\u786e&amp;\u5ba2\u89c2\uff0c\u4e3b\u8981\u662f\u4ee5&#8221;\u662f&#8221;&#8221;\u5426&#8221;\u95ee\u7b54\uff0c\u4fbf\u4e8e\u8fdb\u884c\u5b9a\u91cf\u7edf\u8ba1\u3002\u9700\u8981\u6ce8\u610f\u7684\u662f\uff0c\u6587\u4e2d\u8fd8\u5c1d\u8bd5\u8bbe\u8ba1\u4e86\u5e26\u6709\u591a\u9879\u9009\u62e9\u9898\u7684\u6307\u4ee4\uff0c\u4f46\u53d1\u73b0\u5f53\u524d\u7684MLLM\u53ef\u80fd\u65e0\u6cd5\u9075\u5
faa\u590d\u6742\u7684\u6307\u4ee4\u3002<\/li>\n<\/ol>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"612\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-14.jpeg\" alt=\"\" class=\"wp-image-6371\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-14.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-14-300x255.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br>\u572812\u4e2a\u5f53\u524d\u6bd4\u8f83SOTA\u7684\u65b9\u6cd5\u4e0a\u7684\u8bc4\u4f30\u7ed3\u679c\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"602\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-8.jpeg\" alt=\"\" class=\"wp-image-6363\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-8.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-8-300x251.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<p><br><strong>\u7ed3\u8bba<\/strong>\uff1a\u7efc\u5408\u611f\u77e5\u80fd\u529b\u548c\u8ba4\u77e5\u80fd\u529b\uff0cBLIP2\u3001mPLUG-Owl\u3001InstructBLIP\u3001MiniGPT-4 \u603b\u4f53\u8868\u73b0\u66f4\u4f18\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"325\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-16.jpeg\" alt=\"\" class=\"wp-image-6373\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-16.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-16-300x135.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n\n\n\n<figure class=\"wp-block-image\"><img loading=\"lazy\" decoding=\"async\" width=\"720\" height=\"605\" src=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-9.jpeg\" alt=\"\" 
class=\"wp-image-6365\" srcset=\"https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-9.jpeg 720w, https:\/\/blog.deephour.com\/wp-content\/uploads\/2024\/04\/image-9-300x252.jpeg 300w\" sizes=\"auto, (max-width: 720px) 100vw, 720px\" \/><\/figure>\n","protected":false},"excerpt":{"rendered":"<p>\u4f5c\u8005\uff1a\u69b4\u83b2\u9165\u94fe\u63a5\uff1ahttps:\/\/zhuanlan.zhihu.com\/p\/653902791\u6765\u6e90\uff1a\u77e5\u4e4e\u8457\u4f5c\u6743&hellip;&nbsp;<a href=\"https:\/\/blog.deephour.com\/index.php\/2024\/04\/02\/%e5%a4%9a%e6%a8%a1%e6%80%81%e5%a4%a7%e6%a8%a1%e5%9e%8b-clip-blip-blip2-llava-minigpt4-instructblip-%e7%b3%bb%e5%88%97%e8%a7%a3%e8%af%bb%e8%bd%ac%e8%bd%bd\/\" rel=\"bookmark\">\u9605\u8bfb\u66f4\u591a &raquo;<span class=\"screen-reader-text\">\u591a\u6a21\u6001\u5927\u6a21\u578b CLIP, BLIP, BLIP2, LLaVA, miniGPT4, InstructBLIP \u7cfb\u5217\u89e3\u8bfb(\u8f6c\u8f7d)<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":6354,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"neve_meta_sidebar":"","neve_meta_container":"","neve_meta_enable_content_width":"","neve_meta_content_width":0,"neve_meta_title_alignment":"","neve_meta_author_avatar":"","neve_post_elements_order":"","neve_meta_disable_header":"","neve_meta_disable_footer":"","neve_meta_disable_title":"","footnotes":""},"categories":[53],"tags":[54,55],"class_list":["post-6353","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai_tech-2","tag-54","tag-55"],"_links":{"self":[{"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/posts\/6353","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embedda
ble":true,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/comments?post=6353"}],"version-history":[{"count":1,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/posts\/6353\/revisions"}],"predecessor-version":[{"id":6382,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/posts\/6353\/revisions\/6382"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/media\/6354"}],"wp:attachment":[{"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/media?parent=6353"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/categories?post=6353"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.deephour.com\/index.php\/wp-json\/wp\/v2\/tags?post=6353"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}