[ { "slug": "llava", "num": 1, "title": "LLaVA: Visual Instruction Tuning", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "founder", "year": 2023, "venue": "NeurIPS", "difficulty": 2, "tldr": "给一个只会打字聊天的 AI 装上眼睛——你随手拍张照片发过去，它能看着图陪你说话。", "wordCount": 6046, "readingMinutes": 17, "tags": [ "language", "vision", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava/", "sourcePath": "papers/llava/paper.pdf", "status": "auto-summary" }, { "slug": "3dshape2vecset", "num": 2, "title": "3DShape2VecSet: 3D Shape Representation for Diffusion Models", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2023, "venue": "SIGGRAPH", "difficulty": 4, "tldr": "把一只 3D 柯基拆成 512 张小卡片；电脑学会卡片的规律，就能凭空造出新的 3D 模型。", "wordCount": 6224, "readingMinutes": 18, "tags": [ "diffusion", "3D" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3dshape2vecset/", "sourcePath": "papers/3dshape2vecset/paper.pdf", "status": "auto-summary" }, { "slug": "saycan", "num": 3, "title": "SayCan: Do As I Can, Not As I Say", "topic": "planning", "topicLabel": "High-Level Planning", "era": "founder", "year": 2022, "venue": "CoRL", "difficulty": 2, "tldr": "让\"见多识广但出不了门的 AI\"出主意，让机器人自己摸口袋说\"这事我现在能做\"，两边都点头才动手。", "wordCount": 4946, "readingMinutes": 14, "tags": [ "language", "RL" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/saycan/", "sourcePath": "papers/saycan/paper.pdf", "status": "auto-summary" }, { "slug": "openvla", "num": 4, "title": "OpenVLA: An Open-Source Vision-Language-Action Model", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "classic", "year": 2024, "venue": "CoRL", "difficulty": 3, "tldr": "把一个会\"看图说话\"的 AI 改一改，让它学会\"看一眼桌面就动手摆东西\"，再把全部训练配方开源送出去。", "wordCount": 5383, "readingMinutes": 15, "tags": [ "language", "vision", "VLA", "open-source" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/openvla/", "sourcePath": 
"papers/openvla/paper.pdf", "status": "auto-summary" }, { "slug": "vlas", "num": 5, "title": "VLAS: VLA Model With Speech Instructions", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "frontier", "year": 2025, "venue": "ICLR", "difficulty": 3, "tldr": "机器人直接听原声干活：光凭你的嗓音就认出\"是你在说话\"，再去拿你那只专属的杯子。", "wordCount": 5570, "readingMinutes": 16, "tags": [ "language", "audio-speech", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/vlas/", "sourcePath": "papers/vlas/paper.pdf", "status": "auto-summary" }, { "slug": "mla", "num": 6, "title": "MLA: Multisensory Language-Action Model", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 4, "tldr": "让机器人不只用眼睛看，还会用\"手感\"和\"空间感\"，并且提前猜下一秒发生什么再动手。", "wordCount": 5636, "readingMinutes": 16, "tags": [ "3D", "language", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mla/", "sourcePath": "papers/mla/paper.pdf", "status": "auto-summary" }, { "slug": "cosmos-policy", "num": 7, "title": "Cosmos Policy: Fine-Tuning Video Models for Visuomotor Control", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 5, "tldr": "把一个会\"脑补下一秒视频\"的大模型，再练一遍，就能让它指挥机械臂做家务。", "wordCount": 5888, "readingMinutes": 17, "tags": [ "diffusion", "world-model", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/cosmos-policy/", "sourcePath": "papers/cosmos-policy/paper.pdf", "status": "auto-summary" }, { "slug": "rf-slam", "num": 8, "title": "CartoRadar: RF-Based 3D SLAM Rivaling Vision Approaches", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2023, "venue": "MobiCom 2025 (Best Artifact Award)", "difficulty": 4, "tldr": "给机器人装一颗几百块的小雷达，哪怕屋里又黑又有烟，它也能一边走一边画出准的 3D 地图，比用相机还清楚。", "wordCount": 6567, "readingMinutes": 19, "tags": [ "3D", "RF-radar", "navigation" ], "url": 
"https://estelledc.github.io/embodied-ai-reading-station/papers/rf-slam/", "sourcePath": "papers/rf-slam/paper.pdf", "status": "auto-summary" }, { "slug": "mmclip", "num": 9, "title": "mmCLIP: Boosting mmWave-based Zero-shot HAR via Signal-Text Alignment", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2024, "venue": "SenSys 2024", "difficulty": 4, "tldr": "教一种\"看不见脸\"的小盒子雷达，没学过的新动作也能猜个八九不离十——比如老人半夜在黑卧室摔倒，它能感知到。", "wordCount": 6251, "readingMinutes": 18, "tags": [ "language", "vision", "RF-radar", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mmclip/", "sourcePath": "papers/mmclip/paper.pdf", "status": "auto-summary" }, { "slug": "nlos-mmwave", "num": 10, "title": "mmNorm: Non-Line-of-Sight 3D Object Reconstruction via mmWave Surface Normal Estimation", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "frontier", "year": 2023, "venue": "MobiSys 2025", "difficulty": 4, "tldr": "不直接问\"东西在哪儿\"，而是先猜\"它的皮朝哪边翘\"——雷达就能隔着纸箱看出里面是什么形状。", "wordCount": 5693, "readingMinutes": 16, "tags": [ "3D", "RF-radar" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/nlos-mmwave/", "sourcePath": "papers/nlos-mmwave/paper.pdf", "status": "auto-summary" }, { "slug": "proactive-hearing", "num": 11, "title": "Proactive Hearing Assistants that Isolate Egocentric Conversations", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "frontier", "year": 2024, "venue": "UIST", "difficulty": 3, "tldr": "戴上这副耳机，它自己听出\"现在你在跟谁聊天\"，把同伴的声音放大、其他人压下去，你一个按钮都不用按。", "wordCount": 5847, "readingMinutes": 17, "tags": [ "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/proactive-hearing/", "sourcePath": "papers/proactive-hearing/paper.pdf", "status": "auto-summary" }, { "slug": "neuralaids", "num": 12, "title": "NeuralAids: Wireless Hearables With Programmable Speech AI Accelerators", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": 
"classic", "year": 2024, "venue": "MobiCom", "difficulty": 3, "tldr": "在咖啡馆听不清对面说话？让助听器自己降噪，不连手机、不连云。", "wordCount": 5527, "readingMinutes": 16, "tags": [ "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/neuralaids/", "sourcePath": "papers/neuralaids/paper.pdf", "status": "auto-summary" }, { "slug": "acoustic-swarms", "num": 13, "title": "Creating speech zones with self-distributing acoustic swarms", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "founder", "year": 2023, "venue": "Nature", "difficulty": 3, "tldr": "七个像骰子那么大的小机器人，自己爬上桌散成一圈，桌上几个人同时讲话，它能分清谁说了啥。", "wordCount": 6653, "readingMinutes": 19, "tags": [ "audio-speech", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/acoustic-swarms/", "sourcePath": "papers/acoustic-swarms/paper.md", "status": "auto-summary" }, { "slug": "conv-tasnet", "num": 14, "title": "Conv-TasNet: Surpassing Ideal Time-Frequency Magnitude Masking for Speech Separation", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "founder", "year": 2019, "venue": "IEEE/ACM TASLP", "difficulty": 3, "tldr": "两人同时讲话的混音，喂给一个网络，它能把每个人的声音分别还原。比老方法（看频谱图）更准、更快、更小。", "wordCount": 6834, "readingMinutes": 20, "tags": [ "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/conv-tasnet/", "sourcePath": "papers/conv-tasnet/paper.pdf", "status": "auto-summary" }, { "slug": "soundstream", "num": 15, "title": "SoundStream: An End-to-End Neural Audio Codec", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "founder", "year": 2022, "venue": "IEEE/ACM TASLP", "difficulty": 4, "tldr": "让 AI 自己学怎么把声音\"打包又拆开\"，3 kbps 的小包听起来反而比传统方案 12 kbps 还清楚。", "wordCount": 6276, "readingMinutes": 18, "tags": [ "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/soundstream/", "sourcePath": "papers/soundstream/paper.pdf", "status": "auto-summary" }, { "slug": "audiolm", "num": 16, "title": 
"AudioLM", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2023, "venue": "TASLP", "difficulty": 4, "tldr": "把声音切成两种\"音频字\"——一种管说啥、一种管音色，模型像写句子一样续写，给 3 秒就能接出像本人的语音。", "wordCount": 2976, "readingMinutes": 9, "tags": [ "transformer", "language", "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/audiolm/", "sourcePath": "https://arxiv.org/abs/2209.03143", "status": "auto-summary-light" }, { "slug": "conformer", "num": 17, "title": "Conformer", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2020, "venue": "Interspeech", "difficulty": 3, "tldr": "让 AI 听人说话时既能听清每个字的咬字，又能联系整段话的意思——一个会同时\"听细节\"和\"听大意\"的耳朵。", "wordCount": 2209, "readingMinutes": 6, "tags": [ "transformer", "vision", "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/conformer/", "sourcePath": "https://arxiv.org/abs/2005.08100", "status": "auto-summary-light" }, { "slug": "dprnn", "num": 18, "title": "Dual-path RNN", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2020, "venue": "ICASSP", "difficulty": 4, "tldr": "DPRNN 把超长录音切成小块，让 RNN 先在块里跑、再跨块跑，交替几轮就能把两个人同时说话拆开。", "wordCount": 2615, "readingMinutes": 7, "tags": [ "transformer", "audio-speech", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dprnn/", "sourcePath": "https://arxiv.org/abs/1910.06379", "status": "auto-summary-light" }, { "slug": "encodec", "num": 19, "title": "EnCodec", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2023, "venue": "TMLR", "difficulty": 4, "tldr": "EnCodec 把声音压成一串很小的数字再还原回来；既比老办法省流量，又因为是数字，AI 可以像写字一样\"写\"出声音。", "wordCount": 2701, "readingMinutes": 8, "tags": [ "transformer", "language", "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/encodec/", "sourcePath": "https://arxiv.org/abs/2210.13438", "status": "auto-summary-light" }, { 
"slug": "meta-stylespeech", "num": 20, "title": "Meta-StyleSpeech", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2021, "venue": "ICML", "difficulty": 3, "tldr": "给模型听几秒陌生人说话的录音，它就能用这个人的声音念任意一句话。不用重新训练、不用收集几小时数据——几秒就够。", "wordCount": 2329, "readingMinutes": 7, "tags": [ "transformer", "audio-speech", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/meta-stylespeech/", "sourcePath": "https://arxiv.org/abs/2106.03153", "status": "auto-summary-light" }, { "slug": "musiclm", "num": 21, "title": "MusicLM", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2023, "venue": "arXiv", "difficulty": 4, "tldr": "对着模型说一句\"缓慢爵士钢琴配鼓刷\"，它就生成几分钟真实音乐——先定骨架（结构），再填细节（音色）。", "wordCount": 2524, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "vision", "audio-speech", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/musiclm/", "sourcePath": "https://arxiv.org/abs/2301.11325", "status": "auto-summary-light" }, { "slug": "whisper", "num": 22, "title": "Robust Speech Recognition via Large-Scale Weak Supervision", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "classic", "year": 2023, "venue": "ICML", "difficulty": 3, "tldr": "Whisper 把网上 68 万小时音频和字幕一锅烩，喂进普通 Transformer，开箱就能听各种口音、噪声和长录音，还顺手翻译——靠数据杂取胜。", "wordCount": 7515, "readingMinutes": 21, "tags": [ "transformer", "language", "vision", "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/whisper/", "sourcePath": "papers/whisper/paper.pdf", "status": "auto-summary" }, { "slug": "seamless-m4t", "num": 23, "title": "SeamlessM4T", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "frontier", "year": 2023, "venue": "arXiv", "difficulty": 4, "tldr": "一个模型搞定 100 种语言的\"听懂、翻译、说出来\"，省掉以前三四个 App 接力的麻烦。它一口气会做 5 件事，名字像缩写但其实只是\"输入 → 输出\"的简写：ASR（Automatic Speech Recognition，语音识别，听写成同语言文字）、S2TT（语音 → 文字翻译）、S2ST（语音 → 语音翻译）、T2TT（文字 → 文字翻译）、T2ST（文字 → 语音翻译）。", "wordCount": 2443, 
"readingMinutes": 7, "tags": [ "transformer", "language", "audio-speech" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/seamless-m4t/", "sourcePath": "https://arxiv.org/abs/2308.11596", "status": "auto-summary-light" }, { "slug": "stable-audio", "num": 24, "title": "Stable Audio", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "frontier", "year": 2024, "venue": "ICML", "difficulty": 4, "tldr": "打几个字描述你想要的声音，AI 就能做出几十秒到一两分钟的高音质音乐或音效，长度还能精确到秒。", "wordCount": 2376, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "vision", "audio-speech", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/stable-audio/", "sourcePath": "https://arxiv.org/abs/2402.04825", "status": "auto-summary-light" }, { "slug": "uss-weakly-labelled", "num": 25, "title": "Universal Source Separation with Weakly Labelled Data", "topic": "auditory", "topicLabel": "Auditory & Acoustic", "era": "frontier", "year": 2024, "venue": "TASLP", "difficulty": 4, "tldr": "给电脑一段嘈杂录音，告诉它\"我只要狗叫\"，它就把狗叫从混音里抠出来。一个模型覆盖 527 类日常声音。", "wordCount": 2366, "readingMinutes": 7, "tags": [ "vision", "audio-speech", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/uss-weakly-labelled/", "sourcePath": "https://arxiv.org/abs/2305.07447", "status": "auto-summary-light" }, { "slug": "meta-world", "num": 26, "title": "Meta-World: A Benchmark and Evaluation for Multi-Task and Meta Reinforcement Learning", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "founder", "year": 2019, "venue": "CoRL", "difficulty": 2, "tldr": "给那些号称\"会举一反三\"的机器人算法办一场 50 道动手题的统一考试，看它们是不是真的会。", "wordCount": 5943, "readingMinutes": 17, "tags": [ "manipulation", "navigation", "RL", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/meta-world/", "sourcePath": "papers/meta-world/paper.pdf", "status": "auto-summary" }, { "slug": "rlbench", "num": 27, "title": "RLBench: The Robot Learning 
Benchmark & Learning Environment", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "founder", "year": 2019, "venue": "RA-L", "difficulty": 2, "tldr": "给机器人手臂出了一套 100 道题的\"统考卷\"，从此大家都做同一套题，第一次能公平比谁更厉害。", "wordCount": 7554, "readingMinutes": 22, "tags": [ "vision", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rlbench/", "sourcePath": "papers/rlbench/paper.pdf", "status": "auto-summary" }, { "slug": "robosuite", "num": 28, "title": "robosuite: A Modular Simulation Framework and Benchmark for Robot Learning", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "founder", "year": 2020, "venue": "arXiv", "difficulty": 2, "tldr": "robosuite 是机器人 AI 的\"标准考场\"——同一台仿真机械臂、同一组题目，让全球研究者公平地比谁的算法更聪明。", "wordCount": 7448, "readingMinutes": 21, "tags": [ "manipulation", "RL", "imitation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robosuite/", "sourcePath": "papers/robosuite/paper.pdf", "status": "auto-summary" }, { "slug": "bridgedata-v2", "num": 29, "title": "BridgeData V2", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "classic", "year": 2023, "venue": "CoRL", "difficulty": 2, "tldr": "BridgeData V2 是一份公开的\"机器人干活录像库\"——6 万段机械臂在 24 个真实场景里的演示视频，大家训机器人时把它当共同起跑线。", "wordCount": 2371, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "vision", "imitation", "VLA", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/bridgedata-v2/", "sourcePath": "https://arxiv.org/abs/2308.12952", "status": "auto-summary-light" }, { "slug": "calvin", "num": 30, "title": "CALVIN", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "classic", "year": 2022, "venue": "RA-L", "difficulty": 3, "tldr": "CALVIN 是一把\"机器人听话考试\"的尺子：人说一段话，机器人要在桌上一步接一步把活干完，34 个小任务统一打分。", "wordCount": 2327, "readingMinutes": 7, "tags": [ "vision", "manipulation", "imitation", "VLA" ], "url": 
"https://estelledc.github.io/embodied-ai-reading-station/papers/calvin/", "sourcePath": "https://arxiv.org/abs/2112.03227", "status": "auto-summary-light" }, { "slug": "libero", "num": 31, "title": "LIBERO", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "classic", "year": 2023, "venue": "NeurIPS", "difficulty": 3, "tldr": "教机器人学新技能时别忘旧技能。LIBERO 是这事的标准考卷，4 套题分别考空间、物体、目标和综合。", "wordCount": 2326, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "vision", "imitation", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/libero/", "sourcePath": "https://arxiv.org/abs/2306.03310", "status": "auto-summary-light" }, { "slug": "rh20t", "num": 32, "title": "RH20T", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "classic", "year": 2023, "venue": "RSS Workshop", "difficulty": 3, "tldr": "机器人数据集，除拍视频外还录了\"手感\"和\"声音\"：拧瓶盖多大力、咔哒卡到位。147 项任务、11 万段。", "wordCount": 2079, "readingMinutes": 6, "tags": [ "diffusion", "vision", "manipulation", "imitation", "sim2real", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rh20t/", "sourcePath": "https://arxiv.org/abs/2307.00595", "status": "auto-summary-light" }, { "slug": "robomimic", "num": 33, "title": "What Matters in Learning from Offline Human Demonstrations for Robot Manipulation", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "classic", "year": 2021, "venue": "CoRL", "difficulty": 3, "tldr": "这篇不发明新算法，而是把\"机器人看录像学操作\"里每个变量挨个换一遍，告诉你哪些真有用、哪些是白忙。", "wordCount": 2071, "readingMinutes": 6, "tags": [ "diffusion", "manipulation", "RL", "imitation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robomimic/", "sourcePath": "https://arxiv.org/abs/2108.03298", "status": "auto-summary-light" }, { "slug": "droid", "num": 34, "title": "DROID", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "frontier", "year": 2024, "venue": "RSS", 
"difficulty": 3, "tldr": "全球 18 家实验室一起拍机器人干活的视频，凑出 7.6 万段、564 个真实场景，让机器人不再只会\"自家桌子上那点活\"。", "wordCount": 2308, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "vision", "manipulation", "RL" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/droid/", "sourcePath": "https://arxiv.org/abs/2403.12945", "status": "auto-summary-light" }, { "slug": "open-x-embodiment", "num": 35, "title": "Open X-Embodiment", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "frontier", "year": 2023, "venue": "ICRA", "difficulty": 3, "tldr": "22 家实验室把各种机器人的\"练手视频\"凑成一个大数据集，再训一个通吃模型，发现喂多种机器人比单喂一种学得更好。", "wordCount": 2646, "readingMinutes": 8, "tags": [ "transformer", "language", "RL", "imitation", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/open-x-embodiment/", "sourcePath": "https://arxiv.org/abs/2310.08864", "status": "auto-summary-light" }, { "slug": "robocasa", "num": 36, "title": "RoboCasa", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "frontier", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "想造个会做饭的家用机器人？RoboCasa 给你 120 个虚拟厨房、100 个小动作、十万次练习录像，让它先在游戏里练会，再上岗。", "wordCount": 2198, "readingMinutes": 6, "tags": [ "diffusion", "manipulation", "navigation", "imitation", "VLA", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robocasa/", "sourcePath": "https://arxiv.org/abs/2406.02523", "status": "auto-summary-light" }, { "slug": "simpler-env", "num": 37, "title": "SimplerEnv", "topic": "dataset-eval", "topicLabel": "Datasets & Benchmarks", "era": "frontier", "year": 2024, "venue": "NeurIPS", "difficulty": 4, "tldr": "不用搬真机器人，在电脑里就能给 VLA（视觉-语言-动作模型）打分，分数和真机几乎一样准。", "wordCount": 2280, "readingMinutes": 7, "tags": [ "3D", "vision", "manipulation", "RL", "VLA", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/simpler-env/", "sourcePath": "https://arxiv.org/abs/2405.05941", 
"status": "auto-summary-light" }, { "slug": "diffusion-policy", "num": 38, "title": "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "founder", "year": 2023, "venue": "RSS", "difficulty": 3, "tldr": "让机器人像调电视雪花一样产生动作：从满屏乱码开始，擦几下，下一步该怎么动就擦出来了。", "wordCount": 6554, "readingMinutes": 19, "tags": [ "diffusion", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/diffusion-policy/", "sourcePath": "papers/diffusion-policy/paper.pdf", "status": "auto-summary" }, { "slug": "3d-diffusion-policy", "num": 39, "title": "3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple 3D Representations", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "classic", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "让机器人改看 3D 立体形状（点云）而不是 2D 照片来学动作，10 条示范就够，72 个任务平均比原版强 24.2%。", "wordCount": 5167, "readingMinutes": 15, "tags": [ "diffusion", "transformer", "3D", "vision", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3d-diffusion-policy/", "sourcePath": "papers/3d-diffusion-policy/paper.pdf", "status": "auto-summary" }, { "slug": "consistency-policy", "num": 40, "title": "Consistency Policy: Accelerated Visuomotor Policies via Consistency Distillation", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "classic", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "机器人选下一步动作本来要慢慢搅 100 下才出一步，这篇教它一下就跳到答案——快约十倍，连笔记本都跑得动。", "wordCount": 6177, "readingMinutes": 18, "tags": [ "diffusion", "flow-matching", "transformer", "RF-radar", "navigation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/consistency-policy/", "sourcePath": "papers/consistency-policy/paper.pdf", "status": "auto-summary" }, { "slug": "equibot", "num": 41, "title": "EquiBot: SIM(3)-Equivariant Diffusion Policy", "topic": "diffusion-policy", "topicLabel": 
"Diffusion Policy", "era": "classic", "year": 2024, "venue": "CoRL", "difficulty": 4, "tldr": "教机器人几次就够了。挪位置、转方向、换大小都不用重学，因为这件事直接焊在网络结构里。", "wordCount": 2409, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "3D", "vision", "manipulation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/equibot/", "sourcePath": "https://arxiv.org/abs/2407.01479", "status": "auto-summary-light" }, { "slug": "dit-policy", "num": 42, "title": "DiT-Policy", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2025, "venue": "ICRA", "difficulty": 4, "tldr": "把画图领域火起来的新骨架（DiT）搬到机器人身上，再把每个零件挨个拆开看，到底哪个让它真变好。", "wordCount": 2385, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "vision", "manipulation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dit-policy/", "sourcePath": "https://arxiv.org/abs/2410.10088", "status": "auto-summary-light" }, { "slug": "dppo", "num": 43, "title": "Diffusion Policy Policy Optimization (DPPO)", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2025, "venue": "ICLR", "difficulty": 4, "tldr": "先模仿老师傅、再自己练。DPPO 把\"自己练\"那步拆成很多小动作，让常规 RL 也能调教扩散策略。", "wordCount": 2447, "readingMinutes": 7, "tags": [ "diffusion", "3D", "vision", "RL", "imitation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dppo/", "sourcePath": "https://arxiv.org/abs/2409.00588", "status": "auto-summary-light" }, { "slug": "flow-matching-manipulation", "num": 44, "title": "Affordance-based Robot Manipulation with Flow Matching", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2024, "venue": "IROS", "difficulty": 3, "tldr": "教机器人做事时，先让它看懂物体能怎么用，再用一种\"画直线\"式的方法直接生成动作——比扩散模型更快更稳。", "wordCount": 2567, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "vision", "manipulation", "imitation" ], "url": 
"https://estelledc.github.io/embodied-ai-reading-station/papers/flow-matching-manipulation/", "sourcePath": "https://arxiv.org/abs/2409.01083", "status": "auto-summary-light" }, { "slug": "flow-policy", "num": 45, "title": "FlowPolicy: 3D Flow-based Policy via Consistency Flow Matching", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2025, "venue": "AAAI", "difficulty": 4, "tldr": "让机器人不再\"在脑子里画 100 张草稿才动手\"，而是看一眼立体世界就一步给出动作 — 又快又稳，真机能跑得动。", "wordCount": 2634, "readingMinutes": 8, "tags": [ "diffusion", "flow-matching", "vision", "manipulation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/flow-policy/", "sourcePath": "https://arxiv.org/abs/2412.04987", "status": "auto-summary-light" }, { "slug": "pi0-fast", "num": 46, "title": "FAST: Efficient Action Tokenization for VLA", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2025, "venue": "RSS", "difficulty": 4, "tldr": "机器人动作又长又啰嗦塞不进 AI 模型，FAST 学 MP3 压音乐的办法，把一长串动作压成几十个\"词\"，AI 像说话一样把它念出来。", "wordCount": 2492, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "manipulation", "VLA", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pi0-fast/", "sourcePath": "https://arxiv.org/abs/2501.09747", "status": "auto-summary-light" }, { "slug": "pi0", "num": 47, "title": "pi_0: Vision-Language-Action Flow Model", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 4, "tldr": "让机器人看懂场景、听懂指令、还能丝滑动起来——拿现成的图文大模型当\"大脑\"，再加一个会画连续动作的\"流匹配\"小头。", "wordCount": 2618, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "language", "vision", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pi0/", "sourcePath": "https://arxiv.org/abs/2410.24164", "status": "auto-summary-light" }, { "slug": "pi05", "num": 48, "title": "pi_0.5: VLA with 
Open-World Generalization", "topic": "diffusion-policy", "topicLabel": "Diffusion Policy", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 5, "tldr": "让机器人第一次走进一个陌生人家，也能听懂\"收拾下厨房\"然后自己一步步把活干完。", "wordCount": 2353, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "language", "imitation", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pi05/", "sourcePath": "https://arxiv.org/abs/2504.16054", "status": "auto-summary-light" }, { "slug": "dagger", "num": 49, "title": "A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "founder", "year": 2011, "venue": "AISTATS", "difficulty": 4, "tldr": "光看老师开车的录像不够 — 学生一走偏就越错越离谱。DAgger 让学生自己先开几圈，把走偏的地方拿去问老师答案，再训，反复几轮就稳了。", "wordCount": 7322, "readingMinutes": 21, "tags": [ "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dagger/", "sourcePath": "papers/dagger/paper.pdf", "status": "auto-summary" }, { "slug": "gail", "num": 50, "title": "Generative Adversarial Imitation Learning", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "founder", "year": 2016, "venue": "NeurIPS", "difficulty": 4, "tldr": "让 AI 看大厨做菜的录像，再找个\"挑刺老师\"分辨它做得像不像，靠这种较劲学会做事，不用猜大厨心里的打分标准。", "wordCount": 6398, "readingMinutes": 18, "tags": [ "RL", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gail/", "sourcePath": "papers/gail/paper.pdf", "status": "auto-summary" }, { "slug": "act-aloha", "num": 51, "title": "Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (ACT/ALOHA)", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "classic", "year": 2023, "venue": "RSS", "difficulty": 3, "tldr": "几千美元搭一套双臂遥控器（ALOHA）让人录 50 次示范，机器人就学会一段一段动（ACT），能完成穿扎带这种细活。", "wordCount": 2601, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "manipulation", "RL", "imitation", "VLM" ], 
"url": "https://estelledc.github.io/embodied-ai-reading-station/papers/act-aloha/", "sourcePath": "https://arxiv.org/abs/2304.13705", "status": "auto-summary-light" }, { "slug": "anyteleop", "num": 52, "title": "AnyTeleop", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "classic", "year": 2023, "venue": "CoRL", "difficulty": 3, "tldr": "用一台普通摄像头拍你的手，机械手就跟着模仿你的动作；换什么型号的机械手都不用重写代码。", "wordCount": 2280, "readingMinutes": 7, "tags": [ "diffusion", "3D", "vision", "manipulation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/anyteleop/", "sourcePath": "https://arxiv.org/abs/2307.04577", "status": "auto-summary-light" }, { "slug": "bet", "num": 53, "title": "Behavior Transformers: Cloning k Modes with One Stone", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "classic", "year": 2022, "venue": "NeurIPS", "difficulty": 3, "tldr": "看一堆人做同一件事却各有各的做法，BeT 让 AI 先认出\"有几种主流流派\"，再在每个流派里微调——而不是把所有动作平均成一个四不像。", "wordCount": 2385, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "RL", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/bet/", "sourcePath": "https://arxiv.org/abs/2206.11251", "status": "auto-summary-light" }, { "slug": "ibc", "num": 54, "title": "Implicit Behavioral Cloning", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "classic", "year": 2021, "venue": "CoRL", "difficulty": 4, "tldr": "别让模型直接报\"动作是这个\"，而是让它给一堆候选动作打分、挑最低分那个——机器人的手就突然变巧了。", "wordCount": 6215, "readingMinutes": 18, "tags": [ "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/ibc/", "sourcePath": "papers/ibc/paper.pdf", "status": "auto-summary" }, { "slug": "robocat", "num": 55, "title": "RoboCat", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "classic", "year": 2023, "venue": "TMLR", "difficulty": 4, "tldr": "一个 AI 大脑同时指挥好几种不同的机械臂干活，干完还会把成功的录像收回来当作下一轮的教材，越练越强。", "wordCount": 2370, "readingMinutes": 7, 
"tags": [ "diffusion", "transformer", "language", "vision", "manipulation", "RL" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robocat/", "sourcePath": "https://arxiv.org/abs/2306.11706", "status": "auto-summary-light" }, { "slug": "aloha-2", "num": 56, "title": "ALOHA 2", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2024, "venue": "Tech Report", "difficulty": 2, "tldr": "ALOHA 2 不是新算法，而是把\"教机器人用双手干活\"的那台设备升级了一遍：更顺手、更耐用、图纸全开源，方便大家一起攒训练数据。", "wordCount": 2546, "readingMinutes": 7, "tags": [ "diffusion", "manipulation", "imitation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/aloha-2/", "sourcePath": "https://arxiv.org/abs/2405.02292", "status": "auto-summary-light" }, { "slug": "dexcap", "num": 57, "title": "DexCap", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "人戴上\"会记录动作的手套\"自己干活，把手的轨迹录下来教机器人——机器人完全不必在现场。", "wordCount": 2481, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "vision", "manipulation", "navigation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dexcap/", "sourcePath": "https://arxiv.org/abs/2403.07788", "status": "auto-summary-light" }, { "slug": "humanplus", "num": 58, "title": "HumanPlus", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2024, "venue": "CoRL", "difficulty": 4, "tldr": "HumanPlus 让机器人当场跟着人做动作，做几十次后机器人自己也会了——把人当成机器人的\"示范老师\"。", "wordCount": 2152, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "locomotion", "RL", "imitation", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/humanplus/", "sourcePath": "https://arxiv.org/abs/2406.10454", "status": "auto-summary-light" }, { "slug": "idp3", "num": 59, "title": "Generalizable Humanoid Manipulation with 3D Diffusion Policies (iDP3)", "topic": "imitation", 
"topicLabel": "Imitation Learning", "era": "frontier", "year": 2025, "venue": "RSS", "difficulty": 4, "tldr": "让人形机器人用\"自己眼睛\"的视角看世界（而不是死记房间地图）。换间屋子也照样干活，不用重学。", "wordCount": 2572, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "3D", "language", "vision", "manipulation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/idp3/", "sourcePath": "https://arxiv.org/abs/2410.10803", "status": "auto-summary-light" }, { "slug": "mobile-aloha", "num": 60, "title": "Mobile ALOHA", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2024, "venue": "CoRL", "difficulty": 3, "tldr": "给桌面机器人加了一辆小车，让人手把手带它做家务（炒虾、擦桌、洗碗），每招只示范 50 次就能学会。", "wordCount": 2235, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "vision", "locomotion", "navigation", "RL" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mobile-aloha/", "sourcePath": "https://arxiv.org/abs/2401.02117", "status": "auto-summary-light" }, { "slug": "smolvla", "num": 61, "title": "SmolVLA", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 3, "tldr": "Hugging Face 推出的小型机器人模型：把\"看到 + 听到 + 动手\"塞进一张游戏显卡能训的小脑袋，让没数据中心的人也能在家玩具身 AI。", "wordCount": 2004, "readingMinutes": 6, "tags": [ "diffusion", "flow-matching", "language", "manipulation", "imitation", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/smolvla/", "sourcePath": "https://arxiv.org/abs/2506.01844", "status": "auto-summary-light" }, { "slug": "umi", "num": 62, "title": "Universal Manipulation Interface", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "人手拿一个\"带摄像头的夹子\"在厨房自己做事，录下来就能教机器人，全程不用机器人在场。", "wordCount": 2724, "readingMinutes": 8, "tags": [ "diffusion", "vision", "manipulation", "navigation", "imitation", "sim2real" ], "url": 
"https://estelledc.github.io/embodied-ai-reading-station/papers/umi/", "sourcePath": "https://arxiv.org/abs/2402.10329", "status": "auto-summary-light" }, { "slug": "vq-bet", "num": 63, "title": "Behavior Generation with Latent Actions (VQ-BeT)", "topic": "imitation", "topicLabel": "Imitation Learning", "era": "frontier", "year": 2024, "venue": "ICML", "difficulty": 4, "tldr": "机器人本来要画一条平滑曲线动作，VQ-BeT 让它改成\"先选一个动作词、再小修一点\"——就像挑表情包再加文字，比硬画曲线更不容易出怪招。", "wordCount": 2379, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "imitation", "VLA", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/vq-bet/", "sourcePath": "https://arxiv.org/abs/2403.03181", "status": "auto-summary-light" }, { "slug": "imagebind", "num": 64, "title": "ImageBind: One Embedding Space To Bind Them All", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "founder", "year": 2023, "venue": "CVPR", "difficulty": 3, "tldr": "把图片当翻译官，六种感官（图、文、声、深度、热、动作）就能互相听懂彼此说话。", "wordCount": 6102, "readingMinutes": 17, "tags": [ "transformer", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/imagebind/", "sourcePath": "papers/imagebind/paper.pdf", "status": "auto-summary" }, { "slug": "touch-vision-cross-modal", "num": 65, "title": "Connecting Touch and Vision via Cross-Modal Prediction", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "founder", "year": 2019, "venue": "CVPR", "difficulty": 3, "tldr": "教 AI\"看一眼就猜出摸起来什么感觉、摸一下就猜出在摸哪儿\"，让视觉和触觉互相翻译。", "wordCount": 6810, "readingMinutes": 19, "tags": [ "vision", "tactile", "audio-speech", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/touch-vision-cross-modal/", "sourcePath": "papers/touch-vision-cross-modal/paper.pdf", "status": "auto-summary" }, { "slug": "anymal", "num": 66, "title": "AnyMAL: An Efficient and Scalable Any-Modality Augmented Language Model", "topic": "multimodal", "topicLabel": "Multimodal 
Ecology", "era": "classic", "year": 2023, "venue": "EACL", "difficulty": 3, "tldr": "一句话：给一个\"只识字\"的聪明大脑配几副翻译眼镜——看图、看视频、听声、感运动，统统先翻成\"假文字\"再喂进去，大脑本身一个字都不重学。三件让人眼前一亮的事：不动 LLM 主干：LLaMA-2-70B 全程冻结，只训前面那个小投影层（projection layer），训练成本……", "wordCount": 5896, "readingMinutes": 17, "tags": [ "language", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/anymal/", "sourcePath": "papers/anymal/paper.pdf", "status": "auto-summary" }, { "slug": "audiopalm", "num": 67, "title": "AudioPaLM", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "classic", "year": 2023, "venue": "arXiv", "difficulty": 4, "tldr": "以前要三个工人接力——听写、翻译、配音——才能把你说的中文变成英文语音。AudioPaLM 让一个模型一口气干完，连你的音色都不丢。", "wordCount": 2284, "readingMinutes": 7, "tags": [ "transformer", "language", "audio-speech", "navigation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/audiopalm/", "sourcePath": "https://arxiv.org/abs/2306.12925", "status": "auto-summary-light" }, { "slug": "fromage", "num": 68, "title": "FROMAGe: Grounding LLMs to Images", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "classic", "year": 2023, "venue": "ICML", "difficulty": 3, "tldr": "把一个会说话的大模型整个冻住不动，只在它前后各加一层薄薄的\"翻译片\"，就让它能看图、找图、还能图文混着聊天。", "wordCount": 2225, "readingMinutes": 6, "tags": [ "language", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/fromage/", "sourcePath": "https://arxiv.org/abs/2301.13823", "status": "auto-summary-light" }, { "slug": "onellm", "num": 69, "title": "OneLLM", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "classic", "year": 2024, "venue": "CVPR", "difficulty": 3, "tldr": "OneLLM 用一套通用「翻译机」，让大语言模型同时听懂图像、声音、点云等八种信号——加新信号只要少量训练，不用从头再做。", "wordCount": 2291, "readingMinutes": 7, "tags": [ "transformer", "language", "vision", "audio-speech", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/onellm/", "sourcePath":
"https://arxiv.org/abs/2312.03700", "status": "auto-summary-light" }, { "slug": "x-vlm", "num": 70, "title": "X-VLM: Multi-Grained Vision Language Pre-Training", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "classic", "year": 2022, "venue": "ICML", "difficulty": 4, "tldr": "教 AI 看图，不只学\"整张图配整句话\"，还学\"图里某个物体配某个词\"——这样问图里某个细节也答得准。", "wordCount": 2377, "readingMinutes": 7, "tags": [ "transformer", "language", "vision", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/x-vlm/", "sourcePath": "https://arxiv.org/abs/2111.08276", "status": "auto-summary-light" }, { "slug": "sparsh-x", "num": 71, "title": "Tactile Beyond Pixels (Sparsh-X)", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "frontier", "year": 2025, "venue": "CoRL", "difficulty": 4, "tldr": "让机器人的手指不止\"看\"接触画面，还能听响声、感力度、察打滑——四路信号一起学，摸东西才像人。", "wordCount": 2491, "readingMinutes": 7, "tags": [ "transformer", "vision", "tactile", "manipulation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/sparsh-x/", "sourcePath": "https://arxiv.org/abs/2506.14754", "status": "auto-summary-light" }, { "slug": "sparsh", "num": 72, "title": "Sparsh: Self-supervised Touch Representations", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "frontier", "year": 2024, "venue": "CoRL", "difficulty": 4, "tldr": "以前每个触觉任务都得从零教机器人。Sparsh 先让模型自己看大量触觉画面学一遍，再做具体任务只要少量例子就够。类比：跟小孩先摸过几千次东西、再去学\"握紧水杯\"是一个道理。技术路线和 NLP 里 BERT、视觉里 DINO 一致——先大量自学，再小量微调，只是搬到了触觉这个长期……", "wordCount": 2885, "readingMinutes": 8, "tags": [ "transformer", "vision", "tactile", "VLA", "sim2real", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/sparsh/", "sourcePath": "https://arxiv.org/abs/2410.24090", "status": "auto-summary-light" }, { "slug": "tactile-vla", "num": 73, "title": "Tactile-VLA", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "frontier", "year": 2025, "venue": "CoRL", "difficulty": 4,
"tldr": "让机器人除了会看会听，还学会\"摸\"——能感到扣子\"咔哒\"卡入那一下，干插拔、拧螺丝这种细活不再蛮干。", "wordCount": 2702, "readingMinutes": 8, "tags": [ "transformer", "language", "vision", "tactile", "imitation", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tactile-vla/", "sourcePath": "https://arxiv.org/abs/2507.09160", "status": "auto-summary-light" }, { "slug": "tla-tactile-language-action", "num": 74, "title": "TLA: Tactile-Language-Action", "topic": "multimodal", "topicLabel": "Multimodal Ecology", "era": "frontier", "year": 2025, "venue": "ICRA", "difficulty": 4, "tldr": "让机器人像你闭眼摸钥匙那样——靠\"一段持续的触感\"加上一句话指令，自己决定下一步该怎么用手。", "wordCount": 2334, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "3D", "language", "vision", "tactile" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tla-tactile-language-action/", "sourcePath": "https://arxiv.org/abs/2503.08548", "status": "auto-summary-light" }, { "slug": "code-as-policies", "num": 75, "title": "Code as Policies: Language Model Programs for Embodied Control", "topic": "planning", "topicLabel": "High-Level Planning", "era": "founder", "year": 2023, "venue": "ICRA", "difficulty": 3, "tldr": "你说一句\"把方块叠进碗里\"，AI 当场写几行 Python 代码，机器人立刻照着跑。不用提前教它新动作。", "wordCount": 5094, "readingMinutes": 15, "tags": [ "language", "manipulation", "RL", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/code-as-policies/", "sourcePath": "papers/code-as-policies/paper.pdf", "status": "auto-summary" }, { "slug": "inner-monologue", "num": 76, "title": "Inner Monologue: Embodied Reasoning through Planning with Language Models", "topic": "planning", "topicLabel": "High-Level Planning", "era": "founder", "year": 2022, "venue": "CoRL", "difficulty": 3, "tldr": "让机器人边干活边在心里念叨：看到啥、做成没、人改主意没，全翻成文字塞回 AI，它就能边做边改计划。", "wordCount": 5505, "readingMinutes": 16, "tags": [ "language", "vision", "manipulation" ], "url": 
"https://estelledc.github.io/embodied-ai-reading-station/papers/inner-monologue/", "sourcePath": "papers/inner-monologue/paper.pdf", "status": "auto-summary" }, { "slug": "llm-plus-p", "num": 77, "title": "LLM+P: Empowering LLMs with Optimal Planning", "topic": "planning", "topicLabel": "High-Level Planning", "era": "founder", "year": 2023, "venue": "arXiv", "difficulty": 3, "tldr": "让 LLM 只当翻译——把你说的话翻译成机器格式，真正的规划交给老牌算法去算。LLM 管说话，算法管动脑子。", "wordCount": 1995, "readingMinutes": 6, "tags": [ "language", "RL", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llm-plus-p/", "sourcePath": "https://arxiv.org/abs/2304.11477", "status": "auto-summary-light" }, { "slug": "palm-e", "num": 78, "title": "PaLM-E: An Embodied Multimodal Language Model", "topic": "planning", "topicLabel": "High-Level Planning", "era": "founder", "year": 2023, "venue": "ICML", "difficulty": 4, "tldr": "教 ChatGPT 长出眼睛和手脚：你说一句话 + 让它瞄一眼现场，它直接列出机器人该做的几步。", "wordCount": 6964, "readingMinutes": 20, "tags": [ "transformer", "language", "vision", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/palm-e/", "sourcePath": "papers/palm-e/paper.pdf", "status": "auto-summary" }, { "slug": "progprompt", "num": 79, "title": "ProgPrompt", "topic": "planning", "topicLabel": "High-Level Planning", "era": "founder", "year": 2023, "venue": "ICRA", "difficulty": 2, "tldr": "让大模型像写代码一样做计划：你说\"把苹果放冰箱\"，它直接吐出一串 Python 调用，机器人照着一行行跑就行。", "wordCount": 2154, "readingMinutes": 6, "tags": [ "language", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/progprompt/", "sourcePath": "https://arxiv.org/abs/2209.11302", "status": "auto-summary-light" }, { "slug": "chatgpt-for-robotics", "num": 80, "title": "ChatGPT for Robotics", "topic": "planning", "topicLabel": "High-Level Planning", "era": "classic", "year": 2023, "venue": "IEEE Access", "difficulty": 2, "tldr": "教 ChatGPT 当机器人的\"代写助理\"：先告诉它机器人会做哪些事，再让它把人话翻成代码，人盯着改。", 
"wordCount": 2194, "readingMinutes": 6, "tags": [ "language", "RL", "imitation", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/chatgpt-for-robotics/", "sourcePath": "https://arxiv.org/abs/2306.17582", "status": "auto-summary-light" }, { "slug": "gensim", "num": 81, "title": "GenSim", "topic": "planning", "topicLabel": "High-Level Planning", "era": "classic", "year": 2024, "venue": "ICLR", "difficulty": 3, "tldr": "让 ChatGPT 当\"出题老师\"，自动给机器人编一堆练习关卡，连标准答案也一起写好。", "wordCount": 2114, "readingMinutes": 6, "tags": [ "language", "vision", "manipulation", "imitation", "sim2real", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gensim/", "sourcePath": "https://arxiv.org/abs/2310.01361", "status": "auto-summary-light" }, { "slug": "roboflamingo", "num": 82, "title": "RoboFlamingo", "topic": "planning", "topicLabel": "High-Level Planning", "era": "classic", "year": 2024, "venue": "ICLR", "difficulty": 4, "tldr": "拿一个已经会看图说话的现成大模型当大脑，后面接一只\"小手\"，就教会机械臂干活——不用从头训。", "wordCount": 2089, "readingMinutes": 6, "tags": [ "diffusion", "flow-matching", "transformer", "language", "vision", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/roboflamingo/", "sourcePath": "https://arxiv.org/abs/2311.01378", "status": "auto-summary-light" }, { "slug": "tree-planner", "num": 83, "title": "Tree-Planner", "topic": "planning", "topicLabel": "High-Level Planning", "era": "classic", "year": 2024, "venue": "ICLR", "difficulty": 3, "tldr": "让大模型一次写好十份菜谱，把重复步骤合成一棵树，做菜时照树走，错了就换条岔路，不用反复打电话问。", "wordCount": 2402, "readingMinutes": 7, "tags": [ "language", "locomotion", "world-model", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tree-planner/", "sourcePath": "https://arxiv.org/abs/2310.08582", "status": "auto-summary-light" }, { "slug": "voxposer", "num": 84, "title": "VoxPoser", "topic": "planning", "topicLabel": "High-Level Planning", "era": "classic", "year": 
2023, "venue": "CoRL", "difficulty": 4, "tldr": "VoxPoser 让大模型给机器人画两张 3D 地图：红色地方要去，灰色地方要躲，机器人照着地图走出动作，全程不训练新模型。", "wordCount": 2090, "readingMinutes": 6, "tags": [ "diffusion", "3D", "language", "vision", "RL", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/voxposer/", "sourcePath": "https://arxiv.org/abs/2307.05973", "status": "auto-summary-light" }, { "slug": "millimap", "num": 85, "title": "See Through Smoke: Robust Indoor Mapping with Low-cost mmWave Radar", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "founder", "year": 2020, "venue": "SenSys", "difficulty": 3, "tldr": "机器人在浓烟里也能画出清晰的房间地图——靠一颗几十块的小雷达加一个会\"脑补\"的神经网络。具体两招：训练时让贵的激光雷达（lidar）和便宜的雷达坐同一辆车，把 lidar 的清晰图当作业答案喂给神经网络（cGAN），教雷达学会脑补。学完老师下车，雷达单飞。认门/墙/玻璃/电……", "wordCount": 6108, "readingMinutes": 17, "tags": [ "vision", "RF-radar", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/millimap/", "sourcePath": "papers/millimap/paper.pdf", "status": "auto-summary" }, { "slug": "person-in-wifi", "num": 86, "title": "Can WiFi Estimate Person Pose?", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "founder", "year": 2019, "venue": "ICCV", "difficulty": 3, "tldr": "想象你家路由器除了上网，还能告诉你\"屋里那个人正在做啥姿势\"——胳膊抬到哪、腿怎么弯，全画给你看。", "wordCount": 6303, "readingMinutes": 18, "tags": [ "RF-radar" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/person-in-wifi/", "sourcePath": "papers/person-in-wifi/paper.pdf", "status": "auto-summary" }, { "slug": "3drimr", "num": 87, "title": "3DRIMR: 3D Reconstruction and Imaging via mmWave Radar based on Deep Learning", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2021, "venue": "IPCCC", "difficulty": 3, "tldr": "用 AI 教小雷达\"看清\"物体长啥样：从糊糊的电波信号里还原出完整 3D 形状，烟雾灰尘暗光里也能用。", "wordCount": 2599, "readingMinutes": 7, "tags": [ "3D", "RF-radar", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3drimr/",
"sourcePath": "https://arxiv.org/abs/2108.02858", "status": "auto-summary-light" }, { "slug": "milliego", "num": 88, "title": "milliEgo: Single-chip mmWave Radar Aided Egomotion Estimation via Deep Sensor Fusion", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2020, "venue": "SenSys", "difficulty": 3, "tldr": "把便宜的毫米波雷达和身上的\"动作感应器\"（IMU）用神经网络拼起来，让机器在黑暗、烟雾里也能算出自己走到了哪。", "wordCount": 2179, "readingMinutes": 6, "tags": [ "transformer", "vision", "RF-radar", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/milliego/", "sourcePath": "https://arxiv.org/abs/2006.02266", "status": "auto-summary-light" }, { "slug": "radarhd", "num": 89, "title": "High Resolution Point Clouds from mmWave Radar", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2023, "venue": "ICRA", "difficulty": 3, "tldr": "便宜雷达拍出来的画面很糊。RadarHD 用神经网络当翻译，把糊画面改成像激光雷达那样清晰的点云图，烟雾、黑暗里都能用。", "wordCount": 2660, "readingMinutes": 8, "tags": [ "3D", "RF-radar", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/radarhd/", "sourcePath": "https://arxiv.org/abs/2206.09273", "status": "auto-summary-light" }, { "slug": "radarslam", "num": 90, "title": "RadarSLAM: Radar based Large-Scale SLAM in All Weathers", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2020, "venue": "BMVC", "difficulty": 4, "tldr": "让一台\"会转圈的雷达\"在大雾大雪天里也能给车画地图、记住自己走过哪。", "wordCount": 2561, "readingMinutes": 7, "tags": [ "vision", "RF-radar", "navigation", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/radarslam/", "sourcePath": "https://arxiv.org/abs/2005.02198", "status": "auto-summary-light" }, { "slug": "rf-pose-through-wall", "num": 91, "title": "Through-Wall Pose Imaging in Real-Time with a Many-to-Many Encoder/Decoder Paradigm", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2019, "venue": "arXiv", 
"difficulty": 4, "tldr": "一个 Wi-Fi 小盒子隔着墙照过去，就能画出屋里人的骨架动画——摄像头当老师，电波当学生，学一遍就会了。更具体一点：输入：一个商用雷达（Walabot Developer，几百美元）发出去的电波被人体反射回来后形成的 3D 强度场。输出：屋内每个人的 15 关节点骨架，每……", "wordCount": 11437, "readingMinutes": 33, "tags": [ "vision", "RF-radar" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rf-pose-through-wall/", "sourcePath": "papers/rf-pose-through-wall/paper.pdf", "status": "auto-summary" }, { "slug": "rfmask", "num": 92, "title": "RFMask: A Simple Baseline for Human Silhouette Segmentation with Radio Signals", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2022, "venue": "TMM", "difficulty": 3, "tldr": "漆黑屋子里相机看不见，但雷达回波能\"听\"出人形。RFMask 让模型把雷达信号直接画成每个人的精细剪影——头、肩、胳膊都画出来。", "wordCount": 2615, "readingMinutes": 7, "tags": [ "vision", "RF-radar" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rfmask/", "sourcePath": "https://arxiv.org/abs/2201.10175", "status": "auto-summary-light" }, { "slug": "rfpose-ot", "num": 93, "title": "RFPose-OT: RF-Based 3D Human Pose Estimation via Optimal Transport Theory", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "classic", "year": 2023, "venue": "TCSVT", "difficulty": 4, "tldr": "用雷达回声画出人的姿势：直接学容易乱猜，先把\"回声\"和\"姿势\"两边的特征对齐，再画关节，换房间也更稳。", "wordCount": 2475, "readingMinutes": 7, "tags": [ "3D", "RF-radar" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rfpose-ot/", "sourcePath": "https://arxiv.org/abs/2301.13013", "status": "auto-summary-light" }, { "slug": "argus-mmego", "num": 94, "title": "Argus: Multi-View Egocentric Human Mesh Reconstruction Based on Stripped-Down Wearable mmWave Add-on", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "frontier", "year": 2024, "venue": "SenSys", "difficulty": 4, "tldr": "在肩膀、胸口、手腕各贴一片简化雷达，每片只能看到身体一小块，算法把这些局部信号拼成完整的 3D 人体形状。", "wordCount": 2389, "readingMinutes": 7, "tags": [ "transformer", "3D", "vision", "audio-speech", "RF-radar" ], "url":
"https://estelledc.github.io/embodied-ai-reading-station/papers/argus-mmego/", "sourcePath": "https://arxiv.org/abs/2411.00419", "status": "auto-summary-light" }, { "slug": "mmdiff", "num": 95, "title": "Diffusion Model is a Good Pose Estimator from 3D RF-Vision", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "frontier", "year": 2024, "venue": "CVPR", "difficulty": 4, "tldr": "毫米波雷达拍出的人像隔了层毛玻璃。这篇论文让 AI 从噪点里一笔笔擦出人体骨架，比一次猜准稳得多。CVPR 2024 收录。", "wordCount": 2894, "readingMinutes": 8, "tags": [ "diffusion", "transformer", "3D", "vision", "RF-radar", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mmdiff/", "sourcePath": "https://arxiv.org/abs/2403.16198", "status": "auto-summary-light" }, { "slug": "panoradar", "num": 96, "title": "Enabling Visual Recognition at Radio Frequency (PanoRadar)", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "frontier", "year": 2024, "venue": "MobiCom", "difficulty": 4, "tldr": "PanoRadar 把便宜的小雷达装到一个转台上边转边扫，再让神经网络把模糊回声拼成 3D 地图，让雷达像眼睛一样\"看见\"房间。", "wordCount": 2696, "readingMinutes": 8, "tags": [ "vision", "audio-speech", "RF-radar", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/panoradar/", "sourcePath": "https://arxiv.org/abs/2405.19516", "status": "auto-summary-light" }, { "slug": "wave-former", "num": 97, "title": "Wave-Former: Through-Occlusion 3D Reconstruction via Wireless Shape Completion", "topic": "rf", "topicLabel": "RF Perception & Mapping", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 4, "tldr": "毫米波信号能穿过纸箱、布帘，Wave-Former 把弹回来的模糊回声拼成藏在背后的杯子、瓶子的完整 3D 形状。", "wordCount": 2485, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "3D", "vision", "audio-speech", "RF-radar" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/wave-former/", "sourcePath": "https://arxiv.org/abs/2511.14152", "status": "auto-summary-light" }, { "slug": "habitat", "num": 98, "title": "Habitat: A 
Platform for Embodied AI Research", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "founder", "year": 2019, "venue": "ICCV", "difficulty": 2, "tldr": "给家用机器人造一个跑得飞快的\"VR 房子\"，让它在里面绕路撞墙练几千万步，再上岗去你家。", "wordCount": 6056, "readingMinutes": 17, "tags": [ "vision", "navigation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/habitat/", "sourcePath": "papers/habitat/paper.pdf", "status": "auto-summary" }, { "slug": "isaac-gym", "num": 99, "title": "Isaac Gym: High Performance GPU-Based Physics Simulation For Robot Learning", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "founder", "year": 2021, "venue": "NeurIPS Datasets", "difficulty": 3, "tldr": "一句话：把\"算物理\"和\"训神经网络\"塞进同一张显卡，机器人学走路从\"几千台 CPU 跑一晚\"压成\"一张卡跑几分钟\"。类比：以前训机器人像切菜、炒菜、装盘分三个房间，端来端去比真做菜还累；Isaac Gym 把厨房合并，菜不动、工具换着上。效果对照：OpenAI 训魔方机械手用……", "wordCount": 5362, "readingMinutes": 15, "tags": [ "RL", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/isaac-gym/", "sourcePath": "papers/isaac-gym/paper.pdf", "status": "auto-summary" }, { "slug": "dexmv", "num": 100, "title": "DexMV", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "classic", "year": 2022, "venue": "ECCV", "difficulty": 4, "tldr": "让机械手学拧瓶盖、倒水太难，DexMV 让算法看人手视频学，把人的动作\"翻译\"成仿真里机械手能照着练的示范。", "wordCount": 2271, "readingMinutes": 6, "tags": [ "3D", "vision", "manipulation", "RL", "imitation", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dexmv/", "sourcePath": "https://arxiv.org/abs/2108.05877", "status": "auto-summary-light" }, { "slug": "habitat-2", "num": 101, "title": "Habitat 2.0", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "classic", "year": 2021, "venue": "NeurIPS", "difficulty": 3, "tldr": "上一代 Habitat 只能在虚拟房子里走路看；2.0 让小机器人能真的开冰箱、把杯子从厨房拿到客厅做家务。", "wordCount": 2156, "readingMinutes": 6, "tags": [ "3D", "manipulation", "RL", "dataset" ], "url":
"https://estelledc.github.io/embodied-ai-reading-station/papers/habitat-2/", "sourcePath": "https://arxiv.org/abs/2106.14405", "status": "auto-summary-light" }, { "slug": "maniskill", "num": 102, "title": "ManiSkill", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "classic", "year": 2021, "venue": "NeurIPS", "difficulty": 3, "tldr": "ManiSkill 是教机器人开抽屉、开柜门这类家具操作的统一考场——专测它练完几十个柜子之后，能不能上手没见过的第 101 个。", "wordCount": 2042, "readingMinutes": 6, "tags": [ "diffusion", "3D", "vision", "manipulation", "RL", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/maniskill/", "sourcePath": "https://arxiv.org/abs/2107.14483", "status": "auto-summary-light" }, { "slug": "procthor", "num": 103, "title": "ProcTHOR", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "classic", "year": 2022, "venue": "NeurIPS", "difficulty": 3, "tldr": "过去训练 AI 在屋里走来走去，得人工一间一间搭样板房，慢且少。ProcTHOR 让电脑按规则批量造 1 万套房，AI 见多了，换个没去过的房子也能找到东西。", "wordCount": 2071, "readingMinutes": 6, "tags": [ "language", "sim2real", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/procthor/", "sourcePath": "https://arxiv.org/abs/2206.06994", "status": "auto-summary-light" }, { "slug": "sapien", "num": 104, "title": "SAPIEN: A SimulAted Part-based Interactive ENvironment", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "classic", "year": 2020, "venue": "CVPR", "difficulty": 3, "tldr": "给机器人造了一个虚拟宜家展厅，2,346 件家具每个抽屉、每扇门、每个瓶盖都能真的拉开、推开、拧开。", "wordCount": 6256, "readingMinutes": 18, "tags": [ "3D", "RL" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/sapien/", "sourcePath": "papers/sapien/paper.pdf", "status": "auto-summary" }, { "slug": "behavior-1k", "num": 105, "title": "BEHAVIOR-1K", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "frontier", "year": 2024, "venue": "CoRL", "difficulty": 4, "tldr": "斯坦福搭的\"机器人家务考场\"：1000 道家务题、50 间样板房、9000 多件物品，让所有人用同一把尺子比\"机器人到底会不会做家务\"。",
"wordCount": 1983, "readingMinutes": 6, "tags": [ "3D", "RL", "imitation", "VLA", "sim2real", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/behavior-1k/", "sourcePath": "https://arxiv.org/abs/2403.09227", "status": "auto-summary-light" }, { "slug": "habitat-3", "num": 106, "title": "Habitat 3.0", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "frontier", "year": 2024, "venue": "ICLR", "difficulty": 3, "tldr": "在虚拟的家里加一个会走会动的\"假人\"，让机器人练习扫地搬东西时，得学会一边干活一边躲人、配合人。", "wordCount": 2405, "readingMinutes": 7, "tags": [ "language", "manipulation", "locomotion", "navigation", "RL", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/habitat-3/", "sourcePath": "https://arxiv.org/abs/2310.13724", "status": "auto-summary-light" }, { "slug": "isaac-lab", "num": 107, "title": "Isaac Lab", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 3, "tldr": "机器人在电脑里\"练功\"的虚拟训练场。以前练得飞快但看不清画面，画面漂亮又练得慢；Isaac Lab 把这两件事捏到了一起。", "wordCount": 1944, "readingMinutes": 6, "tags": [ "manipulation", "locomotion", "RL", "sim2real", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/isaac-lab/", "sourcePath": "https://arxiv.org/abs/2511.04831", "status": "auto-summary-light" }, { "slug": "mujoco-playground", "num": 108, "title": "MuJoCo Playground", "topic": "sim", "topicLabel": "Simulation & Sim2Real", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 3, "tldr": "一个 pip install 就能装好的开源仿真平台，让机器人先在电脑里把走路、抓东西练熟，再几乎原样搬到真机上跑。", "wordCount": 1892, "readingMinutes": 5, "tags": [ "3D", "manipulation", "locomotion", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mujoco-playground/", "sourcePath": "https://arxiv.org/abs/2502.08844", "status": "auto-summary-light" }, { "slug": "rt-1", "num": 109, "title": "RT-1: Robotics Transformer for Real-World Control at Scale", "topic": 
"vla", "topicLabel": "End-to-End VLA", "era": "founder", "year": 2022, "venue": "RSS", "difficulty": 3, "tldr": "让机器人看完 13 万段人类亲手示范，就能听一句话，在真办公室里把可乐罐拿出来放进抽屉。", "wordCount": 6767, "readingMinutes": 19, "tags": [ "transformer", "language", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rt-1/", "sourcePath": "papers/rt-1/paper.pdf", "status": "auto-summary" }, { "slug": "dp3", "num": 110, "title": "3D Diffusion Policy (DP3)", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "classic", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "教机器人擦桌子，不给它看照片，改给它看带深度的 3D 点云。结果只用 10 段录像就够学会一个新任务。", "wordCount": 2157, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "3D", "vision", "manipulation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dp3/", "sourcePath": "https://arxiv.org/abs/2403.03954", "status": "auto-summary-light" }, { "slug": "octo", "num": 111, "title": "Octo: An Open-Source Generalist Robot Policy", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "classic", "year": 2024, "venue": "RSS", "difficulty": 3, "tldr": "第一个真正开源的通用机器人\"大脑\"：先看 80 万段机器人录像学基础动作，你下载回来微调几小时，就能让自家机器人学新活。", "wordCount": 2256, "readingMinutes": 6, "tags": [ "diffusion", "flow-matching", "transformer", "language", "vision", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/octo/", "sourcePath": "https://arxiv.org/abs/2405.12213", "status": "auto-summary-light" }, { "slug": "rt-2", "num": 112, "title": "RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "classic", "year": 2023, "venue": "CoRL", "difficulty": 4, "tldr": "把机器人动作翻译成一句话，让会看图聊天的 AI 用写句子的方式开口指挥机器人——它会写字，就能动手。", "wordCount": 6897, "readingMinutes": 20, "tags": [ "transformer", "language", "vision", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rt-2/", "sourcePath":
"papers/rt-2/paper.pdf", "status": "auto-summary" }, { "slug": "rt-trajectory", "num": 113, "title": "RT-Trajectory: Robotic Task Generalization via Hindsight Trajectory Sketches", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "classic", "year": 2023, "venue": "ICLR", "difficulty": 3, "tldr": "教机器人做新动作，光说话不够、给一张完成图也不够。这篇论文说：在画面上画一条\"手该走的路\"——机器人立刻照着做。", "wordCount": 6022, "readingMinutes": 17, "tags": [ "diffusion", "transformer", "language", "vision", "manipulation", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rt-trajectory/", "sourcePath": "papers/rt-trajectory/paper.pdf", "status": "auto-summary" }, { "slug": "3d-vla", "num": 114, "title": "3D-VLA", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2024, "venue": "ICML", "difficulty": 4, "tldr": "让机器人除了看平面照片，还能\"摸到\"立体形状；动手前先在脑里画一张\"做完后的样子\"，再照着画面去动。", "wordCount": 2220, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "3D", "language", "vision", "RL" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3d-vla/", "sourcePath": "https://arxiv.org/abs/2403.09631", "status": "auto-summary-light" }, { "slug": "dexvla", "num": 115, "title": "DexVLA", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 4, "tldr": "让一个只会\"看图说话\"的大脑别动，给它配一只 10 亿参数的\"专业的手\"。脑负责理解，手负责干活，互不干扰。", "wordCount": 2527, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "vision", "manipulation", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dexvla/", "sourcePath": "https://arxiv.org/abs/2502.05855", "status": "auto-summary-light" }, { "slug": "gr-2", "num": 116, "title": "GR-2: Generative Video-Language-Action Model", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 4, "tldr": "让机器人先刷 3800 万段网络视频攒常识，再练动手；它干活时脑子里会\"预演\"下一秒的画面。", "wordCount": 2625, "readingMinutes": 8, 
"tags": [ "diffusion", "transformer", "language", "RL", "world-model", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gr-2/", "sourcePath": "https://arxiv.org/abs/2410.06158", "status": "auto-summary-light" }, { "slug": "openhelix", "num": 117, "title": "OpenHelix", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 3, "tldr": "机器人版的\"大脑加小脑\"分工：大脑慢慢听懂你说的话，小脑飞快动手干活。代码全部开源，对标 Figure 公司不公开的 Helix。", "wordCount": 2535, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "language", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/openhelix/", "sourcePath": "https://arxiv.org/abs/2505.03912", "status": "auto-summary-light" }, { "slug": "openvla-oft", "num": 118, "title": "OpenVLA-OFT", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2025, "venue": "RSS", "difficulty": 3, "tldr": "原版机器人模型一个字一个字念动作，慢还一抖一抖。OpenVLA-OFT 拧开三个开关——一口气说、一段段说、说连续数字——又快又稳。", "wordCount": 2105, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "language", "vision", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/openvla-oft/", "sourcePath": "https://arxiv.org/abs/2502.19645", "status": "auto-summary-light" }, { "slug": "rdt-1b", "num": 119, "title": "RDT-1B: Diffusion Foundation Model for Bimanual Manipulation", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2024, "venue": "ICLR", "difficulty": 4, "tldr": "清华团队给双臂机器人配的\"大脑\"：10 亿参数，听一句话就能让两只机械臂配合着倒水、叠衣服。", "wordCount": 2388, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "language", "vision", "manipulation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rdt-1b/", "sourcePath": "https://arxiv.org/abs/2410.07864", "status": "auto-summary-light" }, { "slug": "robomamba", "num": 120, "title": "RoboMamba", "topic": "vla", "topicLabel": "End-to-End 
VLA", "era": "frontier", "year": 2024, "venue": "NeurIPS", "difficulty": 3, "tldr": "机器人脑子原本用 Transformer 拼出来，反应慢、显存吃紧。RoboMamba 换成 Mamba（一种\"流水线式\"架构），让机器人想得更快、更省。", "wordCount": 1977, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "mamba-ssm", "language", "vision", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robomamba/", "sourcePath": "https://arxiv.org/abs/2406.04339", "status": "auto-summary-light" }, { "slug": "spatialvla", "num": 121, "title": "SpatialVLA", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 4, "tldr": "教机器人两件事：用普通摄像头也能看出远近；常用动作存成肌肉记忆，不用每次重新算。", "wordCount": 2096, "readingMinutes": 6, "tags": [ "transformer", "3D", "vision", "VLA", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/spatialvla/", "sourcePath": "https://arxiv.org/abs/2501.15830", "status": "auto-summary-light" }, { "slug": "tinyvla", "num": 122, "title": "TinyVLA", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2024, "venue": "RA-L", "difficulty": 3, "tldr": "把会听话的机器人大脑瘦身到 1.4B，动作生成换成\"先乱后凿\"的扩散模型，不靠云端也能实时干活。", "wordCount": 2382, "readingMinutes": 7, "tags": [ "diffusion", "flow-matching", "transformer", "language", "vision", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tinyvla/", "sourcePath": "https://arxiv.org/abs/2409.12514", "status": "auto-summary-light" }, { "slug": "tracevla", "num": 123, "title": "TraceVLA: Visual Trace Prompting", "topic": "vla", "topicLabel": "End-to-End VLA", "era": "frontier", "year": 2024, "venue": "ICLR", "difficulty": 3, "tldr": "机器人的手刚走过哪里？TraceVLA 把这条路径直接画在它看到的照片上，让它看见自己的足迹，再决定下一步往哪动。", "wordCount": 2136, "readingMinutes": 6, "tags": [ "transformer", "mamba-ssm", "language", "vision", "VLA", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tracevla/", "sourcePath": 
"https://arxiv.org/abs/2412.10345", "status": "auto-summary-light" }, { "slug": "clip", "num": 124, "title": "Learning Transferable Visual Models From Natural Language Supervision", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "founder", "year": 2021, "venue": "ICML", "difficulty": 3, "tldr": "教 AI 同时认图和认字，把 4 亿对网上图文塞进同一张坐标。之后你说\"一只猫\"，它就能从新图里挑出猫——不用为新任务再训一遍。", "wordCount": 6293, "readingMinutes": 18, "tags": [ "transformer", "language", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/clip/", "sourcePath": "papers/clip/paper.pdf", "status": "auto-summary" }, { "slug": "flamingo", "num": 125, "title": "Flamingo: a Visual Language Model for Few-Shot Learning", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "founder", "year": 2022, "venue": "NeurIPS", "difficulty": 4, "tldr": "教一个会聊天的 AI 也学会看图，给它看两三个示范，它就能照着做新题。", "wordCount": 6302, "readingMinutes": 18, "tags": [ "transformer", "language", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/flamingo/", "sourcePath": "papers/flamingo/paper.pdf", "status": "auto-summary" }, { "slug": "blip-2", "num": 126, "title": "BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2023, "venue": "ICML", "difficulty": 4, "tldr": "BLIP-2 不动两个大模型——一个负责看图、一个负责说话——只在中间训练一个小\"翻译\"，就让 AI 学会了看图说话。", "wordCount": 2678, "readingMinutes": 8, "tags": [ "transformer", "language", "vision", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/blip-2/", "sourcePath": "https://arxiv.org/abs/2301.12597", "status": "auto-summary-light" }, { "slug": "blip", "num": 127, "title": "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2022, "venue": 
"ICML", "difficulty": 3, "tldr": "一句话：让一个模型同时学会看图和写字，再让它帮自己把网上烂配文重写干净，回头再用干净数据训一遍——多个任务全线变强。三个关键贡献： MED（Multimodal mixture of Encoder-Decoder）：一个模型三种身份切换——纯编码器、看图的文本编码器、看图的文", "wordCount": 5849, "readingMinutes": 17, "tags": [ "transformer", "language", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/blip/", "sourcePath": "papers/blip/paper.pdf", "status": "auto-summary" }, { "slug": "deepseek-vl", "num": 128, "title": "DeepSeek-VL: Towards Real-World Vision-Language Understanding", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2024, "venue": "arXiv", "difficulty": 3, "tldr": "DeepSeek 在 2024 年开源的\"会看图\"小模型，主打能看清发票、PPT、论文截图里的小字，不只会答考试题。", "wordCount": 2729, "readingMinutes": 8, "tags": [ "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/deepseek-vl/", "sourcePath": "https://arxiv.org/abs/2403.05525", "status": "auto-summary-light" }, { "slug": "eva-clip", "num": 129, "title": "EVA-CLIP: Improved Training Techniques for CLIP at Scale", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2023, "venue": "arXiv", "difficulty": 3, "tldr": "不改 CLIP 架构，只改训练流程：用一个已经\"懂图\"的视觉模型起步 + 训练时只看半张图——更少数据反而训出更强的看图模型。", "wordCount": 2824, "readingMinutes": 8, "tags": [ "transformer", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/eva-clip/", "sourcePath": "https://arxiv.org/abs/2303.15389", "status": "auto-summary-light" }, { "slug": "filip", "num": 130, "title": "FILIP: Fine-grained Interactive Language-Image Pre-Training", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2022, "venue": "ICLR", "difficulty": 3, "tldr": "以前是\"整张图配整句话\"，FILIP 让图的每一小块和句子的每个词互相找最像的伙伴，模型就能学会\"狗在左下角\"这种细节。", "wordCount": 2695, "readingMinutes": 8, "tags": [ "transformer", "vision" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/filip/", 
"sourcePath": "https://arxiv.org/abs/2111.07783", "status": "auto-summary-light" }, { "slug": "florence-2", "num": 131, "title": "Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2024, "venue": "CVPR", "difficulty": 3, "tldr": "一个看图模型，你跟它说\"圈猫\"\"描述这张图\"\"找红车\"它都能用同一个脑子做，回答全是一段文字。", "wordCount": 2232, "readingMinutes": 6, "tags": [ "transformer", "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/florence-2/", "sourcePath": "https://arxiv.org/abs/2311.06242", "status": "auto-summary-light" }, { "slug": "internvl", "num": 132, "title": "InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2024, "venue": "CVPR", "difficulty": 4, "tldr": "让\"看图的脑子\"也长到 6B 参数，和\"会说话的脑子\"一样大，AI 看图说话才不偏科，而且开源就能用。", "wordCount": 2239, "readingMinutes": 6, "tags": [ "transformer", "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/internvl/", "sourcePath": "https://arxiv.org/abs/2312.14238", "status": "auto-summary-light" }, { "slug": "llava-1-5", "num": 133, "title": "Improved Baselines with Visual Instruction Tuning", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2024, "venue": "CVPR", "difficulty": 2, "tldr": "给会聊天的 AI 配一副\"看图眼镜\"。把眼镜从一片镜片换成两片，再多给它看点带字的图片，看图答题就刷榜了。", "wordCount": 2181, "readingMinutes": 6, "tags": [ "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava-1-5/", "sourcePath": "https://arxiv.org/abs/2310.03744", "status": "auto-summary-light" }, { "slug": "obelics", "num": 134, "title": "OBELICS", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2023, "venue": "NeurIPS", 
"difficulty": 3, "tldr": "HuggingFace 把网上 1.41 亿个\"图文穿插\"的网页洗干净打包开源，让大家也能像 DeepMind 那样训出会看图读长文的模型。", "wordCount": 2181, "readingMinutes": 6, "tags": [ "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/obelics/", "sourcePath": "https://arxiv.org/abs/2306.16527", "status": "auto-summary-light" }, { "slug": "qwen-vl", "num": 135, "title": "Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2023, "venue": "arXiv", "difficulty": 3, "tldr": "给会聊天的 AI 戴副眼镜：一次学会看图、念中英文招牌、用框指出物体、还能多轮聊天。这就是阿里 2023 年开源的 Qwen-VL。", "wordCount": 2132, "readingMinutes": 6, "tags": [ "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/qwen-vl/", "sourcePath": "https://arxiv.org/abs/2308.12966", "status": "auto-summary-light" }, { "slug": "siglip", "num": 136, "title": "Sigmoid Loss for Language Image Pre-Training", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "classic", "year": 2023, "venue": "ICCV", "difficulty": 3, "tldr": "教模型\"图配文字\"，CLIP 要全班一起排名打分，SigLIP 改成一对一判断\"是不是一对\"。算得快、省内存、小批也能学。", "wordCount": 2180, "readingMinutes": 6, "tags": [ "transformer", "vision", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/siglip/", "sourcePath": "https://arxiv.org/abs/2303.15343", "status": "auto-summary-light" }, { "slug": "idefics-2", "num": 137, "title": "What matters when building vision-language models?", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "NeurIPS", "difficulty": 3, "tldr": "做\"看图说话 AI\"时大家凭感觉选零件，这篇把每个选择拆开做对照实验，整理成一份避坑清单，再训了个 8B 模型当样板。", "wordCount": 2201, "readingMinutes": 6, "tags": [ "transformer", "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/idefics-2/", 
"sourcePath": "https://arxiv.org/abs/2405.02246", "status": "auto-summary-light" }, { "slug": "internvl-2-5", "num": 138, "title": "Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 4, "tldr": "把模型、数据、推理三件事一起加大，让免费开源的看图模型第一次在大学考试里追上顶级闭源模型。", "wordCount": 2863, "readingMinutes": 8, "tags": [ "language", "vision", "VLM", "dataset", "open-source" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/internvl-2-5/", "sourcePath": "https://arxiv.org/abs/2412.05271", "status": "auto-summary-light" }, { "slug": "llama-3-herd", "num": 139, "title": "The Llama 3 Herd of Models", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 4, "tldr": "Meta 把训练 Llama 3 大模型的全套\"菜谱\"公开了——用了什么料、多少张卡、跑多久、考多少分。", "wordCount": 1959, "readingMinutes": 6, "tags": [ "transformer", "language", "vision", "RL", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llama-3-herd/", "sourcePath": "https://arxiv.org/abs/2407.21783", "status": "auto-summary-light" }, { "slug": "llava-next-interleave", "num": 140, "title": "LLaVA-NeXT-Interleave", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 3, "tldr": "教 AI 像刷图文并茂的小红书：图和字按顺序穿着读，多图、视频、3D 都用这一招，不用各训一个模型。", "wordCount": 2328, "readingMinutes": 7, "tags": [ "3D", "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava-next-interleave/", "sourcePath": "https://arxiv.org/abs/2407.07895", "status": "auto-summary-light" }, { "slug": "llava-onevision", "num": 141, "title": "LLaVA-OneVision: Easy Visual Task Transfer", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "arXiv", 
"difficulty": 3, "tldr": "一套配方教会一个模型同时看懂单张图、几张图、和视频，开源圈第一次在视频上接近 GPT-4V。", "wordCount": 1987, "readingMinutes": 6, "tags": [ "language", "vision", "VLA", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava-onevision/", "sourcePath": "https://arxiv.org/abs/2408.03326", "status": "auto-summary-light" }, { "slug": "long-clip", "num": 142, "title": "Long-CLIP: Unlocking the Long-Text Capability of CLIP", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "ECCV", "difficulty": 3, "tldr": "给只能读 77 字短纸条的 CLIP 做两个小手术，让它能读 248 字的长纸条，但又没忘掉原来认识的那些短词。", "wordCount": 2395, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "vision", "VLM" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/long-clip/", "sourcePath": "https://arxiv.org/abs/2403.15378", "status": "auto-summary-light" }, { "slug": "pixtral-12b", "num": 143, "title": "Pixtral 12B", "topic": "vlm-foundation", "topicLabel": "VLM Foundation", "era": "frontier", "year": 2024, "venue": "arXiv", "difficulty": 3, "tldr": "Mistral 开源的\"会看图聊天的助手\"——从一开始就同时学看图和说话，图想多大就多大，能免费拿去做产品。", "wordCount": 2068, "readingMinutes": 6, "tags": [ "transformer", "language", "vision", "VLM", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pixtral-12b/", "sourcePath": "https://arxiv.org/abs/2410.07073", "status": "auto-summary-light" }, { "slug": "dreamer-v1", "num": 144, "title": "Dream to Control: Learning Behaviors by Latent Imagination", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "founder", "year": 2020, "venue": "ICLR", "difficulty": 4, "tldr": "教 AI 在脑子里反复\"做白日梦\"演练动作，不用真去摔跤，就能学会跑步、翻跟头这种复杂动作。", "wordCount": 6449, "readingMinutes": 18, "tags": [ "mamba-ssm", "RL", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dreamer-v1/", "sourcePath": "papers/dreamer-v1/paper.pdf", "status": "auto-summary" }, 
{ "slug": "world-models-ha", "num": 145, "title": "World Models", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "founder", "year": 2018, "venue": "NeurIPS", "difficulty": 3, "tldr": "让 AI 先在自己脑子里反复\"做白日梦\"练打游戏，练熟了再去真游戏里上场——居然真能赢。", "wordCount": 6473, "readingMinutes": 18, "tags": [ "RL", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/world-models-ha/", "sourcePath": "papers/world-models-ha/paper.pdf", "status": "auto-summary" }, { "slug": "daydreamer", "num": 146, "title": "DayDreamer", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "classic", "year": 2022, "venue": "CoRL", "difficulty": 3, "tldr": "让一只四足机器人不靠仿真，在真实世界里 1 小时就学会走路——靠的是边走边在脑子里\"做梦\"演练。", "wordCount": 2994, "readingMinutes": 9, "tags": [ "mamba-ssm", "vision", "RL", "imitation", "world-model", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/daydreamer/", "sourcePath": "https://arxiv.org/abs/2206.14176", "status": "auto-summary-light" }, { "slug": "dreamer-v2", "num": 147, "title": "Mastering Atari with Discrete World Models", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "classic", "year": 2021, "venue": "ICLR", "difficulty": 4, "tldr": "让 AI 闭眼\"做白日梦\"练打老游戏，第一次只靠脑子里想象就打到人类水平。", "wordCount": 6048, "readingMinutes": 17, "tags": [ "mamba-ssm", "vision", "RL", "imitation", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dreamer-v2/", "sourcePath": "papers/dreamer-v2/paper.pdf", "status": "auto-summary" }, { "slug": "dreamer-v3", "num": 148, "title": "Dreamer V3: Mastering Diverse Domains through World Models", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "classic", "year": 2025, "venue": "Nature", "difficulty": 4, "tldr": "同一套设置，让一个 AI 自己玩 150 多种游戏都不用改参数，还第一次靠自己挖到《我的世界》里的钻石。", "wordCount": 2507, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "mamba-ssm", "RL", 
"world-model", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dreamer-v3/", "sourcePath": "https://arxiv.org/abs/2301.04104", "status": "auto-summary-light" }, { "slug": "iris-world-model", "num": 149, "title": "Transformers are Sample-Efficient World Models", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "classic", "year": 2023, "venue": "ICLR", "difficulty": 4, "tldr": "把游戏画面切成一格格\"积木\"，让 AI 像写句子一样接龙下一帧，然后让它在脑子里\"自己跟自己玩\"练强化学习——只玩两小时就接近人类水平。", "wordCount": 2446, "readingMinutes": 7, "tags": [ "transformer", "mamba-ssm", "language", "RL", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/iris-world-model/", "sourcePath": "https://arxiv.org/abs/2209.00588", "status": "auto-summary-light" }, { "slug": "transformer-world-model", "num": 150, "title": "TWM: Transformer-based World Models", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "classic", "year": 2023, "venue": "ICLR", "difficulty": 4, "tldr": "agent 在脑子里\"做梦\"练本事。这篇把梦的引擎从 RNN 换成 Transformer，记得更长，做得更准。", "wordCount": 2258, "readingMinutes": 6, "tags": [ "transformer", "mamba-ssm", "language", "RL", "world-model", "dataset" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/transformer-world-model/", "sourcePath": "https://arxiv.org/abs/2303.07109", "status": "auto-summary-light" }, { "slug": "1x-world-model-2025", "num": 151, "title": "1X World Model Challenge", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 3, "tldr": "1X 教人形机器人 Neo \"脑补下一秒画面\"：拿现成视频 AI 当底子，喂自家机器人录像微调，再做成公开赛让大家来卷。", "wordCount": 2600, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "RL", "imitation", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/1x-world-model-2025/", "sourcePath": "https://arxiv.org/abs/2510.07092", "status": "auto-summary-light" 
}, { "slug": "cosmos-world-foundation", "num": 152, "title": "Cosmos World Foundation Model Platform", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2025, "venue": "arXiv", "difficulty": 5, "tldr": "NVIDIA 用 2000 万小时真实视频，训了一个能\"猜下一秒物理世界长啥样\"的大模型，给机器人和无人车当通用底座。", "wordCount": 2284, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "RL", "world-model", "VLA" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/cosmos-world-foundation/", "sourcePath": "https://arxiv.org/abs/2501.03575", "status": "auto-summary-light" }, { "slug": "gaia-1", "num": 153, "title": "GAIA-1", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2023, "venue": "arXiv", "difficulty": 4, "tldr": "GAIA-1 是个会做梦的开车模拟器：给它一段街景视频的开头加一句\"我现在打方向盘\"，它能接着画出后面几秒街上看到的画面。", "wordCount": 2288, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "language", "vision", "RL", "imitation" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gaia-1/", "sourcePath": "https://arxiv.org/abs/2309.17080", "status": "auto-summary-light" }, { "slug": "genie", "num": 154, "title": "Genie: Generative Interactive Environments", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2024, "venue": "ICML", "difficulty": 4, "tldr": "Genie 看一堆游戏录屏，自己猜出每帧之间\"按了什么键\"，再用这个\"按键\"画出下一帧——把死视频变成能玩的小游戏。", "wordCount": 2186, "readingMinutes": 6, "tags": [ "diffusion", "transformer", "vision", "RL", "imitation", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/genie/", "sourcePath": "https://arxiv.org/abs/2402.15391", "status": "auto-summary-light" }, { "slug": "navigation-world-models", "num": 155, "title": "Navigation World Models", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2025, "venue": "CVPR", "difficulty": 4, "tldr": 
"让机器人\"走\"之前先在脑子里放一段未来几秒的画面，看会不会撞墙，再决定真的怎么走。", "wordCount": 2468, "readingMinutes": 7, "tags": [ "diffusion", "transformer", "navigation", "RL", "world-model" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/navigation-world-models/", "sourcePath": "https://arxiv.org/abs/2412.03572", "status": "auto-summary-light" }, { "slug": "unisim", "num": 156, "title": "UniSim", "topic": "world-model", "topicLabel": "World Model & Video Policy", "era": "frontier", "year": 2024, "venue": "ICLR", "difficulty": 4, "tldr": "看过海量视频后，你给它一个动作（说一句话 / 推一下机械臂 / 挪一下镜头），它就生成接下来世界长什么样的视频——像一台会脑补现实的\"游戏机\"。", "wordCount": 2530, "readingMinutes": 7, "tags": [ "diffusion", "vision", "world-model", "VLM", "sim2real" ], "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/unisim/", "sourcePath": "https://arxiv.org/abs/2310.06114", "status": "auto-summary-light" } ]