[
  {
    "slug": "llava",
    "num": 1,
    "title": "LLaVA: Visual Instruction Tuning",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "founder",
    "year": 2023,
    "venue": "NeurIPS",
    "difficulty": 2,
    "tldr": "给一个只会打字聊天的 AI 装上眼睛——你随手拍张照片发过去，它能看着图陪你说话。",
    "wordCount": 6046,
    "readingMinutes": 17,
    "tags": [
      "language",
      "vision",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava/",
    "sourcePath": "papers/llava/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "3dshape2vecset",
    "num": 2,
    "title": "3DShape2VecSet: 3D Shape Representation for Diffusion Models",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2023,
    "venue": "SIGGRAPH",
    "difficulty": 4,
    "tldr": "把一只 3D 柯基拆成 512 张小卡片；电脑学会卡片的规律，就能凭空造出新的 3D 模型。",
    "wordCount": 6224,
    "readingMinutes": 18,
    "tags": [
      "diffusion",
      "3D"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3dshape2vecset/",
    "sourcePath": "papers/3dshape2vecset/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "saycan",
    "num": 3,
    "title": "SayCan: Do As I Can, Not As I Say",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "founder",
    "year": 2022,
    "venue": "CoRL",
    "difficulty": 2,
    "tldr": "让\"见多识广但出不了门的 AI\"出主意，让机器人自己摸口袋说\"这事我现在能做\"，两边都点头才动手。",
    "wordCount": 4946,
    "readingMinutes": 14,
    "tags": [
      "language",
      "RL"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/saycan/",
    "sourcePath": "papers/saycan/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "openvla",
    "num": 4,
    "title": "OpenVLA: An Open-Source Vision-Language-Action Model",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "classic",
    "year": 2024,
    "venue": "CoRL",
    "difficulty": 3,
    "tldr": "把一个会\"看图说话\"的 AI 改一改，让它学会\"看一眼桌面就动手摆东西\"，再把全部训练配方开源送出去。",
    "wordCount": 5383,
    "readingMinutes": 15,
    "tags": [
      "language",
      "vision",
      "VLA",
      "open-source"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/openvla/",
    "sourcePath": "papers/openvla/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "vlas",
    "num": 5,
    "title": "VLAS: VLA Model With Speech Instructions",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "frontier",
    "year": 2025,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "机器人直接听原声干活：光凭你的嗓音就认出\"是你在说话\"，再去拿你那只专属的杯子。",
    "wordCount": 5570,
    "readingMinutes": 16,
    "tags": [
      "language",
      "audio-speech",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/vlas/",
    "sourcePath": "papers/vlas/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "mla",
    "num": 6,
    "title": "MLA: Multisensory Language-Action Model",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "让机器人不只用眼睛看，还会用\"手感\"和\"空间感\"，并且提前猜下一秒发生什么再动手。",
    "wordCount": 5636,
    "readingMinutes": 16,
    "tags": [
      "3D",
      "language",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mla/",
    "sourcePath": "papers/mla/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "cosmos-policy",
    "num": 7,
    "title": "Cosmos Policy: Fine-Tuning Video Models for Visuomotor Control",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 5,
    "tldr": "把一个会\"脑补下一秒视频\"的大模型，再练一遍，就能让它指挥机械臂做家务。",
    "wordCount": 5888,
    "readingMinutes": 17,
    "tags": [
      "diffusion",
      "world-model",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/cosmos-policy/",
    "sourcePath": "papers/cosmos-policy/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "rf-slam",
    "num": 8,
    "title": "CartoRadar: RF-Based 3D SLAM Rivaling Vision Approaches",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2025,
    "venue": "MobiCom 2025 (Best Artifact Award)",
    "difficulty": 4,
    "tldr": "给机器人装一颗几百块的小雷达，哪怕屋里又黑又有烟，它也能一边走一边画出准的 3D 地图，比用相机还清楚。",
    "wordCount": 6567,
    "readingMinutes": 19,
    "tags": [
      "3D",
      "RF-radar",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rf-slam/",
    "sourcePath": "papers/rf-slam/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "mmclip",
    "num": 9,
    "title": "mmCLIP: Boosting mmWave-based Zero-shot HAR via Signal-Text Alignment",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2024,
    "venue": "SenSys 2024",
    "difficulty": 4,
    "tldr": "教一种\"看不见脸\"的小盒子雷达，没学过的新动作也能猜个八九不离十——比如老人半夜在黑卧室摔倒，它能感知到。",
    "wordCount": 6251,
    "readingMinutes": 18,
    "tags": [
      "language",
      "vision",
      "RF-radar",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mmclip/",
    "sourcePath": "papers/mmclip/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "nlos-mmwave",
    "num": 10,
    "title": "mmNorm: Non-Line-of-Sight 3D Object Reconstruction via mmWave Surface Normal Estimation",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "frontier",
    "year": 2025,
    "venue": "MobiSys 2025",
    "difficulty": 4,
    "tldr": "不直接问\"东西在哪儿\"，而是先猜\"它的皮朝哪边翘\"——雷达就能隔着纸箱看出里面是什么形状。",
    "wordCount": 5693,
    "readingMinutes": 16,
    "tags": [
      "3D",
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/nlos-mmwave/",
    "sourcePath": "papers/nlos-mmwave/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "proactive-hearing",
    "num": 11,
    "title": "Proactive Hearing Assistants that Isolate Egocentric Conversations",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "frontier",
    "year": 2024,
    "venue": "UIST",
    "difficulty": 3,
    "tldr": "戴上这副耳机，它自己听出\"现在你在跟谁聊天\"，把同伴的声音放大、其他人压下去，你一个按钮都不用按。",
    "wordCount": 5847,
    "readingMinutes": 17,
    "tags": [
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/proactive-hearing/",
    "sourcePath": "papers/proactive-hearing/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "neuralaids",
    "num": 12,
    "title": "NeuralAids: Wireless Hearables With Programmable Speech AI Accelerators",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2024,
    "venue": "MobiCom",
    "difficulty": 3,
    "tldr": "在咖啡馆听不清对面说话？让助听器自己降噪，不连手机、不连云。",
    "wordCount": 5527,
    "readingMinutes": 16,
    "tags": [
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/neuralaids/",
    "sourcePath": "papers/neuralaids/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "acoustic-swarms",
    "num": 13,
    "title": "Creating speech zones with self-distributing acoustic swarms",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "founder",
    "year": 2023,
    "venue": "Nature",
    "difficulty": 3,
    "tldr": "七个像骰子那么大的小机器人，自己爬上桌散成一圈，桌上几个人同时讲话，它能分清谁说了啥。",
    "wordCount": 6653,
    "readingMinutes": 19,
    "tags": [
      "audio-speech",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/acoustic-swarms/",
    "sourcePath": "papers/acoustic-swarms/paper.md",
    "status": "auto-summary"
  },
  {
    "slug": "conv-tasnet",
    "num": 14,
    "title": "Conv-TasNet: Surpassing Ideal Time-Frequency Magnitude Masking for Speech Separation",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "founder",
    "year": 2019,
    "venue": "IEEE/ACM TASLP",
    "difficulty": 3,
    "tldr": "两人同时讲话的混音，喂给一个网络，它能把每个人的声音分别还原。比老方法（看频谱图）更准、更快、更小。",
    "wordCount": 6834,
    "readingMinutes": 20,
    "tags": [
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/conv-tasnet/",
    "sourcePath": "papers/conv-tasnet/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "soundstream",
    "num": 15,
    "title": "SoundStream: An End-to-End Neural Audio Codec",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "founder",
    "year": 2022,
    "venue": "IEEE/ACM TASLP",
    "difficulty": 4,
    "tldr": "让 AI 自己学怎么把声音\"打包又拆开\"，3 kbps 的小包听起来反而比传统方案 12 kbps 还清楚。",
    "wordCount": 6276,
    "readingMinutes": 18,
    "tags": [
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/soundstream/",
    "sourcePath": "papers/soundstream/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "audiolm",
    "num": 16,
    "title": "AudioLM",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2023,
    "venue": "IEEE/ACM TASLP",
    "difficulty": 4,
    "tldr": "把声音切成两种\"音频字\"——一种管说啥、一种管音色，模型像写句子一样续写，给 3 秒就能接出像本人的语音。",
    "wordCount": 2976,
    "readingMinutes": 9,
    "tags": [
      "transformer",
      "language",
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/audiolm/",
    "sourcePath": "https://arxiv.org/abs/2209.03143",
    "status": "auto-summary-light"
  },
  {
    "slug": "conformer",
    "num": 17,
    "title": "Conformer",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2020,
    "venue": "Interspeech",
    "difficulty": 3,
    "tldr": "让 AI 听人说话时既能听清每个字的咬字，又能联系整段话的意思——一个会同时\"听细节\"和\"听大意\"的耳朵。",
    "wordCount": 2209,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "vision",
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/conformer/",
    "sourcePath": "https://arxiv.org/abs/2005.08100",
    "status": "auto-summary-light"
  },
  {
    "slug": "dprnn",
    "num": 18,
    "title": "Dual-path RNN",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2020,
    "venue": "ICASSP",
    "difficulty": 4,
    "tldr": "DPRNN 把超长录音切成小块，让 RNN 先在块里跑、再跨块跑，交替几轮就能把两个人同时说话拆开。",
    "wordCount": 2615,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "audio-speech",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dprnn/",
    "sourcePath": "https://arxiv.org/abs/1910.06379",
    "status": "auto-summary-light"
  },
  {
    "slug": "encodec",
    "num": 19,
    "title": "EnCodec",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2023,
    "venue": "TMLR",
    "difficulty": 4,
    "tldr": "EnCodec 把声音压成一串很小的数字再还原回来；既比老办法省流量，又因为是数字，AI 可以像写字一样\"写\"出声音。",
    "wordCount": 2701,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "language",
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/encodec/",
    "sourcePath": "https://arxiv.org/abs/2210.13438",
    "status": "auto-summary-light"
  },
  {
    "slug": "meta-stylespeech",
    "num": 20,
    "title": "Meta-StyleSpeech",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2021,
    "venue": "ICML",
    "difficulty": 3,
    "tldr": "给模型听几秒陌生人说话的录音，它就能用这个人的声音念任意一句话。不用重新训练、不用收集几小时数据——几秒就够。",
    "wordCount": 2329,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "audio-speech",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/meta-stylespeech/",
    "sourcePath": "https://arxiv.org/abs/2106.03153",
    "status": "auto-summary-light"
  },
  {
    "slug": "musiclm",
    "num": 21,
    "title": "MusicLM",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "对着模型说一句\"缓慢爵士钢琴配鼓刷\"，它就生成几分钟真实音乐——先定骨架（结构），再填细节（音色）。",
    "wordCount": 2524,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "audio-speech",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/musiclm/",
    "sourcePath": "https://arxiv.org/abs/2301.11325",
    "status": "auto-summary-light"
  },
  {
    "slug": "whisper",
    "num": 22,
    "title": "Robust Speech Recognition via Large-Scale Weak Supervision",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "classic",
    "year": 2023,
    "venue": "ICML",
    "difficulty": 3,
    "tldr": "Whisper 把网上 68 万小时音频和字幕一锅烩，喂进普通 Transformer，开箱就能听各种口音、噪声和长录音，还顺手翻译——靠数据杂取胜。",
    "wordCount": 7515,
    "readingMinutes": 21,
    "tags": [
      "transformer",
      "language",
      "vision",
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/whisper/",
    "sourcePath": "papers/whisper/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "seamless-m4t",
    "num": 23,
    "title": "SeamlessM4T",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "frontier",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "一个模型搞定 100 种语言的\"听懂、翻译、说出来\"，省掉以前三四个 App 接力的麻烦。",
    "wordCount": 2443,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "language",
      "audio-speech"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/seamless-m4t/",
    "sourcePath": "https://arxiv.org/abs/2308.11596",
    "status": "auto-summary-light"
  },
  {
    "slug": "stable-audio",
    "num": 24,
    "title": "Stable Audio",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "frontier",
    "year": 2024,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "打几个字描述你想要的声音，AI 就能做出几十秒到一两分钟的高音质音乐或音效，长度还能精确到秒。",
    "wordCount": 2376,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "audio-speech",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/stable-audio/",
    "sourcePath": "https://arxiv.org/abs/2402.04825",
    "status": "auto-summary-light"
  },
  {
    "slug": "uss-weakly-labelled",
    "num": 25,
    "title": "Universal Source Separation with Weakly Labelled Data",
    "topic": "auditory",
    "topicLabel": "Auditory & Acoustic",
    "era": "frontier",
    "year": 2024,
    "venue": "IEEE/ACM TASLP",
    "difficulty": 4,
    "tldr": "给电脑一段嘈杂录音，告诉它\"我只要狗叫\"，它就把狗叫从混音里抠出来。一个模型覆盖 527 类日常声音。",
    "wordCount": 2366,
    "readingMinutes": 7,
    "tags": [
      "vision",
      "audio-speech",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/uss-weakly-labelled/",
    "sourcePath": "https://arxiv.org/abs/2305.07447",
    "status": "auto-summary-light"
  },
  {
    "slug": "meta-world",
    "num": 26,
    "title": "Meta-World: A Benchmark and Evaluation for Multi-Task and Meta Reinforcement Learning",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "founder",
    "year": 2019,
    "venue": "CoRL",
    "difficulty": 2,
    "tldr": "给那些号称\"会举一反三\"的机器人算法办一场 50 道动手题的统一考试，看它们是不是真的会。",
    "wordCount": 5943,
    "readingMinutes": 17,
    "tags": [
      "manipulation",
      "navigation",
      "RL",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/meta-world/",
    "sourcePath": "papers/meta-world/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "rlbench",
    "num": 27,
    "title": "RLBench: The Robot Learning Benchmark & Learning Environment",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "founder",
    "year": 2019,
    "venue": "RA-L",
    "difficulty": 2,
    "tldr": "给机器人手臂出了一套 100 道题的\"统考卷\"，从此大家都做同一套题，第一次能公平比谁更厉害。",
    "wordCount": 7554,
    "readingMinutes": 22,
    "tags": [
      "vision",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rlbench/",
    "sourcePath": "papers/rlbench/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "robosuite",
    "num": 28,
    "title": "robosuite: A Modular Simulation Framework and Benchmark for Robot Learning",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "founder",
    "year": 2020,
    "venue": "arXiv",
    "difficulty": 2,
    "tldr": "robosuite 是机器人 AI 的\"标准考场\"——同一台仿真机械臂、同一组题目，让全球研究者公平地比谁的算法更聪明。",
    "wordCount": 7448,
    "readingMinutes": 21,
    "tags": [
      "manipulation",
      "RL",
      "imitation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robosuite/",
    "sourcePath": "papers/robosuite/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "bridgedata-v2",
    "num": 29,
    "title": "BridgeData V2",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "classic",
    "year": 2023,
    "venue": "CoRL",
    "difficulty": 2,
    "tldr": "BridgeData V2 是一份公开的\"机器人干活录像库\"——6 万段机械臂在 24 个真实场景里的演示视频，大家训机器人时把它当共同起跑线。",
    "wordCount": 2371,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "imitation",
      "VLA",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/bridgedata-v2/",
    "sourcePath": "https://arxiv.org/abs/2308.12952",
    "status": "auto-summary-light"
  },
  {
    "slug": "calvin",
    "num": 30,
    "title": "CALVIN",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "classic",
    "year": 2022,
    "venue": "RA-L",
    "difficulty": 3,
    "tldr": "CALVIN 是一把\"机器人听话考试\"的尺子：人说一段话，机器人要在桌上一步接一步把活干完，34 个小任务统一打分。",
    "wordCount": 2327,
    "readingMinutes": 7,
    "tags": [
      "vision",
      "manipulation",
      "imitation",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/calvin/",
    "sourcePath": "https://arxiv.org/abs/2112.03227",
    "status": "auto-summary-light"
  },
  {
    "slug": "libero",
    "num": 31,
    "title": "LIBERO",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "classic",
    "year": 2023,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "教机器人学新技能时别忘旧技能。LIBERO 是这事的标准考卷，4 套题分别考空间、物体、目标和综合。",
    "wordCount": 2326,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "imitation",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/libero/",
    "sourcePath": "https://arxiv.org/abs/2306.03310",
    "status": "auto-summary-light"
  },
  {
    "slug": "rh20t",
    "num": 32,
    "title": "RH20T",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "classic",
    "year": 2023,
    "venue": "RSS Workshop",
    "difficulty": 3,
    "tldr": "机器人数据集，除拍视频外还录了\"手感\"和\"声音\"：拧瓶盖多大力、咔哒卡到位。147 项任务、11 万段。",
    "wordCount": 2079,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "vision",
      "manipulation",
      "imitation",
      "sim2real",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rh20t/",
    "sourcePath": "https://arxiv.org/abs/2307.00595",
    "status": "auto-summary-light"
  },
  {
    "slug": "robomimic",
    "num": 33,
    "title": "What Matters in Learning from Offline Human Demonstrations for Robot Manipulation",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "classic",
    "year": 2021,
    "venue": "CoRL",
    "difficulty": 3,
    "tldr": "这篇不发明新算法，而是把\"机器人看录像学操作\"里每个变量挨个换一遍，告诉你哪些真有用、哪些是白忙。",
    "wordCount": 2071,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "manipulation",
      "RL",
      "imitation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robomimic/",
    "sourcePath": "https://arxiv.org/abs/2108.03298",
    "status": "auto-summary-light"
  },
  {
    "slug": "droid",
    "num": 34,
    "title": "DROID",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "frontier",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "全球 18 家实验室一起拍机器人干活的视频，凑出 7.6 万段、564 个真实场景，让机器人不再只会\"自家桌子上那点活\"。",
    "wordCount": 2308,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "vision",
      "manipulation",
      "RL"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/droid/",
    "sourcePath": "https://arxiv.org/abs/2403.12945",
    "status": "auto-summary-light"
  },
  {
    "slug": "open-x-embodiment",
    "num": 35,
    "title": "Open X-Embodiment",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "frontier",
    "year": 2023,
    "venue": "ICRA",
    "difficulty": 3,
    "tldr": "22 家实验室把各种机器人的\"练手视频\"凑成一个大数据集，再训一个通吃模型，发现喂多种机器人比单喂一种学得更好。",
    "wordCount": 2646,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "language",
      "RL",
      "imitation",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/open-x-embodiment/",
    "sourcePath": "https://arxiv.org/abs/2310.08864",
    "status": "auto-summary-light"
  },
  {
    "slug": "robocasa",
    "num": 36,
    "title": "RoboCasa",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "frontier",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "想造个会做饭的家用机器人？RoboCasa 给你 120 个虚拟厨房、100 个小动作、十万次练习录像，让它先在游戏里练会，再上岗。",
    "wordCount": 2198,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "manipulation",
      "navigation",
      "imitation",
      "VLA",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robocasa/",
    "sourcePath": "https://arxiv.org/abs/2406.02523",
    "status": "auto-summary-light"
  },
  {
    "slug": "simpler-env",
    "num": 37,
    "title": "SimplerEnv",
    "topic": "dataset-eval",
    "topicLabel": "Datasets & Benchmarks",
    "era": "frontier",
    "year": 2024,
    "venue": "NeurIPS",
    "difficulty": 4,
    "tldr": "不用搬真机器人，在电脑里就能给 VLA（视觉-语言-动作模型）打分，分数和真机几乎一样准。",
    "wordCount": 2280,
    "readingMinutes": 7,
    "tags": [
      "3D",
      "vision",
      "manipulation",
      "RL",
      "VLA",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/simpler-env/",
    "sourcePath": "https://arxiv.org/abs/2405.05941",
    "status": "auto-summary-light"
  },
  {
    "slug": "diffusion-policy",
    "num": 38,
    "title": "Diffusion Policy: Visuomotor Policy Learning via Action Diffusion",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "founder",
    "year": 2023,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "让机器人像调电视雪花一样产生动作：从满屏乱码开始，擦几下，下一步该怎么动就擦出来了。",
    "wordCount": 6554,
    "readingMinutes": 19,
    "tags": [
      "diffusion",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/diffusion-policy/",
    "sourcePath": "papers/diffusion-policy/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "3d-diffusion-policy",
    "num": 39,
    "title": "3D Diffusion Policy: Generalizable Visuomotor Policy Learning via Simple 3D Representations",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "classic",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "让机器人改看 3D 立体形状（点云）而不是 2D 照片来学动作，10 条示范就够，72 个任务平均比原版强 24.2%。",
    "wordCount": 5167,
    "readingMinutes": 15,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "vision",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3d-diffusion-policy/",
    "sourcePath": "papers/3d-diffusion-policy/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "consistency-policy",
    "num": 40,
    "title": "Consistency Policy: Accelerated Visuomotor Policies via Consistency Distillation",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "classic",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "机器人选下一步动作本来要慢慢搅 100 下才出一步，这篇教它一下就跳到答案——快约十倍，连笔记本都跑得动。",
    "wordCount": 6177,
    "readingMinutes": 18,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "RF-radar",
      "navigation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/consistency-policy/",
    "sourcePath": "papers/consistency-policy/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "equibot",
    "num": 41,
    "title": "EquiBot: SIM(3)-Equivariant Diffusion Policy",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "classic",
    "year": 2024,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "教机器人几次就够了。挪位置、转方向、换大小都不用重学，因为这件事直接焊在网络结构里。",
    "wordCount": 2409,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "vision",
      "manipulation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/equibot/",
    "sourcePath": "https://arxiv.org/abs/2407.01479",
    "status": "auto-summary-light"
  },
  {
    "slug": "dit-policy",
    "num": 42,
    "title": "DiT-Policy",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "ICRA",
    "difficulty": 4,
    "tldr": "把画图领域火起来的新骨架（DiT）搬到机器人身上，再把每个零件挨个拆开看，到底哪个让它真变好。",
    "wordCount": 2385,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "vision",
      "manipulation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dit-policy/",
    "sourcePath": "https://arxiv.org/abs/2410.10088",
    "status": "auto-summary-light"
  },
  {
    "slug": "dppo",
    "num": 43,
    "title": "Diffusion Policy Policy Optimization (DPPO)",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "先模仿老师傅、再自己练。DPPO 把\"自己练\"那步拆成很多小动作，让常规 RL 也能调教扩散策略。",
    "wordCount": 2447,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "3D",
      "vision",
      "RL",
      "imitation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dppo/",
    "sourcePath": "https://arxiv.org/abs/2409.00588",
    "status": "auto-summary-light"
  },
  {
    "slug": "flow-matching-manipulation",
    "num": 44,
    "title": "Affordance-based Robot Manipulation with Flow Matching",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2024,
    "venue": "IROS",
    "difficulty": 3,
    "tldr": "教机器人做事时，先让它看懂物体能怎么用，再用一种\"画直线\"式的方法直接生成动作——比扩散模型更快更稳。",
    "wordCount": 2567,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "vision",
      "manipulation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/flow-matching-manipulation/",
    "sourcePath": "https://arxiv.org/abs/2409.01083",
    "status": "auto-summary-light"
  },
  {
    "slug": "flow-policy",
    "num": 45,
    "title": "FlowPolicy: 3D Flow-based Policy via Consistency Flow Matching",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "AAAI",
    "difficulty": 4,
    "tldr": "让机器人不再\"在脑子里画 100 张草稿才动手\"，而是看一眼立体世界就一步给出动作——又快又稳，真机能跑得动。",
    "wordCount": 2634,
    "readingMinutes": 8,
    "tags": [
      "diffusion",
      "flow-matching",
      "vision",
      "manipulation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/flow-policy/",
    "sourcePath": "https://arxiv.org/abs/2412.04987",
    "status": "auto-summary-light"
  },
  {
    "slug": "pi0-fast",
    "num": 46,
    "title": "FAST: Efficient Action Tokenization for VLA",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "RSS",
    "difficulty": 4,
    "tldr": "机器人动作又长又啰嗦塞不进 AI 模型，FAST 学 MP3 压音乐的办法，把一长串动作压成几十个\"词\"，AI 像说话一样把它念出来。",
    "wordCount": 2492,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "manipulation",
      "VLA",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pi0-fast/",
    "sourcePath": "https://arxiv.org/abs/2501.09747",
    "status": "auto-summary-light"
  },
  {
    "slug": "pi0",
    "num": 47,
    "title": "pi_0: Vision-Language-Action Flow Model",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "让机器人看懂场景、听懂指令、还能丝滑动起来——拿现成的图文大模型当\"大脑\"，再加一个会画连续动作的\"流匹配\"小头。",
    "wordCount": 2618,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "vision",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pi0/",
    "sourcePath": "https://arxiv.org/abs/2410.24164",
    "status": "auto-summary-light"
  },
  {
    "slug": "pi05",
    "num": 48,
    "title": "pi_0.5: VLA with Open-World Generalization",
    "topic": "diffusion-policy",
    "topicLabel": "Diffusion Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 5,
    "tldr": "让机器人第一次走进一个陌生人家，也能听懂\"收拾下厨房\"然后自己一步步把活干完。",
    "wordCount": 2353,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "imitation",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pi05/",
    "sourcePath": "https://arxiv.org/abs/2504.16054",
    "status": "auto-summary-light"
  },
  {
    "slug": "dagger",
    "num": 49,
    "title": "A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "founder",
    "year": 2011,
    "venue": "AISTATS",
    "difficulty": 4,
    "tldr": "光看老师开车的录像不够——学生一走偏就越错越离谱。DAgger 让学生自己先开几圈，把走偏的地方拿去问老师答案，再训，反复几轮就稳了。",
    "wordCount": 7322,
    "readingMinutes": 21,
    "tags": [
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dagger/",
    "sourcePath": "papers/dagger/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "gail",
    "num": 50,
    "title": "Generative Adversarial Imitation Learning",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "founder",
    "year": 2016,
    "venue": "NeurIPS",
    "difficulty": 4,
    "tldr": "让 AI 看大厨做菜的录像，再找个\"挑刺老师\"分辨它做得像不像，靠这种较劲学会做事，不用猜大厨心里的打分标准。",
    "wordCount": 6398,
    "readingMinutes": 18,
    "tags": [
      "RL",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gail/",
    "sourcePath": "papers/gail/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "act-aloha",
    "num": 51,
    "title": "Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (ACT/ALOHA)",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "classic",
    "year": 2023,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "几千美元搭一套双臂遥控器（ALOHA）让人录 50 次示范，机器人就学会一段一段动（ACT），能完成穿扎带这种细活。",
    "wordCount": 2601,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "manipulation",
      "RL",
      "imitation",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/act-aloha/",
    "sourcePath": "https://arxiv.org/abs/2304.13705",
    "status": "auto-summary-light"
  },
  {
    "slug": "anyteleop",
    "num": 52,
    "title": "AnyTeleop",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "classic",
    "year": 2023,
    "venue": "CoRL",
    "difficulty": 3,
    "tldr": "用一台普通摄像头拍你的手，机械手就跟着模仿你的动作；换什么型号的机械手都不用重写代码。",
    "wordCount": 2280,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "3D",
      "vision",
      "manipulation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/anyteleop/",
    "sourcePath": "https://arxiv.org/abs/2307.04577",
    "status": "auto-summary-light"
  },
  {
    "slug": "bet",
    "num": 53,
    "title": "Behavior Transformers: Cloning k Modes with One Stone",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "classic",
    "year": 2022,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "看一堆人做同一件事却各有各的做法，BeT 让 AI 先认出\"有几种主流流派\"，再在每个流派里微调——而不是把所有动作平均成一个四不像。",
    "wordCount": 2385,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "RL",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/bet/",
    "sourcePath": "https://arxiv.org/abs/2206.11251",
    "status": "auto-summary-light"
  },
  {
    "slug": "ibc",
    "num": 54,
    "title": "Implicit Behavioral Cloning",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "classic",
    "year": 2021,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "别让模型直接报\"动作是这个\"，而是让它给一堆候选动作打分、挑最低分那个——机器人的手就突然变巧了。",
    "wordCount": 6215,
    "readingMinutes": 18,
    "tags": [
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/ibc/",
    "sourcePath": "papers/ibc/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "robocat",
    "num": 55,
    "title": "RoboCat",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "classic",
    "year": 2023,
    "venue": "TMLR",
    "difficulty": 4,
    "tldr": "一个 AI 大脑同时指挥好几种不同的机械臂干活，干完还会把成功的录像收回来当作下一轮的教材，越练越强。",
    "wordCount": 2370,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "vision",
      "manipulation",
      "RL"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robocat/",
    "sourcePath": "https://arxiv.org/abs/2306.11706",
    "status": "auto-summary-light"
  },
  {
    "slug": "aloha-2",
    "num": 56,
    "title": "ALOHA 2",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2024,
    "venue": "Tech Report",
    "difficulty": 2,
    "tldr": "ALOHA 2 不是新算法，而是把\"教机器人用双手干活\"的那台设备升级了一遍：更顺手、更耐用、图纸全开源，方便大家一起攒训练数据。",
    "wordCount": 2546,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "manipulation",
      "imitation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/aloha-2/",
    "sourcePath": "https://arxiv.org/abs/2405.02292",
    "status": "auto-summary-light"
  },
  {
    "slug": "dexcap",
    "num": 57,
    "title": "DexCap",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "人戴上\"会记录动作的手套\"自己干活，把手的轨迹录下来教机器人——机器人完全不必在现场。",
    "wordCount": 2481,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "manipulation",
      "navigation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dexcap/",
    "sourcePath": "https://arxiv.org/abs/2403.07788",
    "status": "auto-summary-light"
  },
  {
    "slug": "humanplus",
    "num": 58,
    "title": "HumanPlus",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2024,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "HumanPlus 让机器人当场跟着人做动作，做几十次后机器人自己也会了——把人当成机器人的\"示范老师\"。",
    "wordCount": 2152,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "locomotion",
      "RL",
      "imitation",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/humanplus/",
    "sourcePath": "https://arxiv.org/abs/2406.10454",
    "status": "auto-summary-light"
  },
  {
    "slug": "idp3",
    "num": 59,
    "title": "Generalizable Humanoid Manipulation with 3D Diffusion Policies (iDP3)",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2025,
    "venue": "RSS",
    "difficulty": 4,
    "tldr": "让人形机器人用\"自己眼睛\"的视角看世界（而不是死记房间地图）。换间屋子也照样干活，不用重学。",
    "wordCount": 2572,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "language",
      "vision",
      "manipulation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/idp3/",
    "sourcePath": "https://arxiv.org/abs/2410.10803",
    "status": "auto-summary-light"
  },
  {
    "slug": "mobile-aloha",
    "num": 60,
    "title": "Mobile ALOHA",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2024,
    "venue": "CoRL",
    "difficulty": 3,
    "tldr": "给桌面机器人加了一辆小车，让人手把手带它做家务（炒虾、擦桌、洗碗），每招只示范 50 次就能学会。",
    "wordCount": 2235,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "locomotion",
      "navigation",
      "RL"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mobile-aloha/",
    "sourcePath": "https://arxiv.org/abs/2401.02117",
    "status": "auto-summary-light"
  },
  {
    "slug": "smolvla",
    "num": 61,
    "title": "SmolVLA",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "Hugging Face 推出的小型机器人模型：把\"看到 + 听到 + 动手\"塞进一张游戏显卡能训的小脑袋，让没数据中心的人也能在家玩具身 AI。",
    "wordCount": 2004,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "flow-matching",
      "language",
      "manipulation",
      "imitation",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/smolvla/",
    "sourcePath": "https://arxiv.org/abs/2506.01844",
    "status": "auto-summary-light"
  },
  {
    "slug": "umi",
    "num": 62,
    "title": "Universal Manipulation Interface",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "人手拿一个\"带摄像头的夹子\"在厨房自己做事，录下来就能教机器人，全程不用机器人在场。",
    "wordCount": 2724,
    "readingMinutes": 8,
    "tags": [
      "diffusion",
      "vision",
      "manipulation",
      "navigation",
      "imitation",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/umi/",
    "sourcePath": "https://arxiv.org/abs/2402.10329",
    "status": "auto-summary-light"
  },
  {
    "slug": "vq-bet",
    "num": 63,
    "title": "Behavior Generation with Latent Actions (VQ-BeT)",
    "topic": "imitation",
    "topicLabel": "Imitation Learning",
    "era": "frontier",
    "year": 2024,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "机器人本来要画一条平滑曲线动作，VQ-BeT 让它改成\"先选一个动作词、再小修一点\"——就像挑表情包再加文字，比硬画曲线更不容易出怪招。",
    "wordCount": 2379,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "imitation",
      "VLA",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/vq-bet/",
    "sourcePath": "https://arxiv.org/abs/2403.03181",
    "status": "auto-summary-light"
  },
  {
    "slug": "imagebind",
    "num": 64,
    "title": "ImageBind: One Embedding Space To Bind Them All",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "founder",
    "year": 2023,
    "venue": "CVPR",
    "difficulty": 3,
    "tldr": "把图片当翻译官，六种感官（图、文、声、深度、热、动作）就能互相听懂彼此说话。",
    "wordCount": 6102,
    "readingMinutes": 17,
    "tags": [
      "transformer",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/imagebind/",
    "sourcePath": "papers/imagebind/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "touch-vision-cross-modal",
    "num": 65,
    "title": "Connecting Touch and Vision via Cross-Modal Prediction",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "founder",
    "year": 2019,
    "venue": "CVPR",
    "difficulty": 3,
    "tldr": "教 AI\"看一眼就猜出摸起来什么感觉、摸一下就猜出在摸哪儿\"，让视觉和触觉互相翻译。",
    "wordCount": 6810,
    "readingMinutes": 19,
    "tags": [
      "vision",
      "tactile",
      "audio-speech",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/touch-vision-cross-modal/",
    "sourcePath": "papers/touch-vision-cross-modal/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "anymal",
    "num": 66,
    "title": "AnyMAL: An Efficient and Scalable Any-Modality Augmented Language Model",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "classic",
    "year": 2023,
    "venue": "EACL",
    "difficulty": 3,
    "tldr": "一句话：给一个\"只识字\"的聪明大脑配几副翻译眼镜——看图、看视频、听声、感运动，统统先翻成\"假文字\"再喂进去，大脑本身一个字都不重学。 三件让人眼前一亮的事： 不动 LLM 主干：LLaMA-2-70B 全程冻结，只训前面那个小投影层（projection layer），训练成本…",
    "wordCount": 5896,
    "readingMinutes": 17,
    "tags": [
      "language",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/anymal/",
    "sourcePath": "papers/anymal/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "audiopalm",
    "num": 67,
    "title": "AudioPaLM",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "classic",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "以前要三个工人接力——听写、翻译、配音——才能把你说的中文变成英文语音。AudioPaLM 让一个模型一口气干完，连你的音色都不丢。",
    "wordCount": 2284,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "language",
      "audio-speech",
      "navigation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/audiopalm/",
    "sourcePath": "https://arxiv.org/abs/2306.12925",
    "status": "auto-summary-light"
  },
  {
    "slug": "fromage",
    "num": 68,
    "title": "FROMAGe: Grounding LLMs to Images",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "classic",
    "year": 2023,
    "venue": "ICML",
    "difficulty": 3,
    "tldr": "把一个会说话的大模型整个冻住不动，只在它前后各加一层薄薄的\"翻译片\"，就让它能看图、找图、还能图文混着聊天。",
    "wordCount": 2225,
    "readingMinutes": 6,
    "tags": [
      "language",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/fromage/",
    "sourcePath": "https://arxiv.org/abs/2301.13823",
    "status": "auto-summary-light"
  },
  {
    "slug": "onellm",
    "num": 69,
    "title": "OneLLM",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "classic",
    "year": 2024,
    "venue": "CVPR",
    "difficulty": 3,
    "tldr": "OneLLM 用一套通用「翻译机」，让大语言模型同时听懂图像、声音、点云等八种信号——加新信号只要少量训练，不用从头再做。",
    "wordCount": 2291,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "language",
      "vision",
      "audio-speech",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/onellm/",
    "sourcePath": "https://arxiv.org/abs/2312.03700",
    "status": "auto-summary-light"
  },
  {
    "slug": "x-vlm",
    "num": 70,
    "title": "X-VLM: Multi-Grained Vision Language Pre-Training",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "classic",
    "year": 2022,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "教 AI 看图，不只学\"整张图配整句话\"，还学\"图里某个物体配某个词\"——这样问图里某个细节也答得准。",
    "wordCount": 2377,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/x-vlm/",
    "sourcePath": "https://arxiv.org/abs/2111.08276",
    "status": "auto-summary-light"
  },
  {
    "slug": "sparsh-x",
    "num": 71,
    "title": "Tactile Beyond Pixels (Sparsh-X)",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "frontier",
    "year": 2025,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "让机器人的手指不止\"看\"接触画面，还能听响声、感力度、察打滑——四路信号一起学，摸东西才像人。",
    "wordCount": 2491,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "vision",
      "tactile",
      "manipulation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/sparsh-x/",
    "sourcePath": "https://arxiv.org/abs/2506.14754",
    "status": "auto-summary-light"
  },
  {
    "slug": "sparsh",
    "num": 72,
    "title": "Sparsh: Self-supervised Touch Representations",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "frontier",
    "year": 2024,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "以前每个触觉任务都得从零教机器人。Sparsh 先让模型自己看大量触觉画面学一遍，再做具体任务只要少量例子就够。 类比：跟小孩先摸过几千次东西、再去学\"握紧水杯\"是一个道理。技术路线和 NLP 里 BERT、视觉里 DINO 一致——先大量自学，再小量微调，只是搬到了触觉这个长期…",
    "wordCount": 2885,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "vision",
      "tactile",
      "VLA",
      "sim2real",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/sparsh/",
    "sourcePath": "https://arxiv.org/abs/2410.24090",
    "status": "auto-summary-light"
  },
  {
    "slug": "tactile-vla",
    "num": 73,
    "title": "Tactile-VLA",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "frontier",
    "year": 2025,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "让机器人除了会看会听，还学会\"摸\"——能感到扣子\"咔哒\"卡入那一下，干插拔、拧螺丝这种细活不再蛮干。",
    "wordCount": 2702,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "language",
      "vision",
      "tactile",
      "imitation",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tactile-vla/",
    "sourcePath": "https://arxiv.org/abs/2507.09160",
    "status": "auto-summary-light"
  },
  {
    "slug": "tla-tactile-language-action",
    "num": 74,
    "title": "TLA: Tactile-Language-Action",
    "topic": "multimodal",
    "topicLabel": "Multimodal Ecology",
    "era": "frontier",
    "year": 2025,
    "venue": "ICRA",
    "difficulty": 4,
    "tldr": "让机器人像你闭眼摸钥匙那样——靠\"一段持续的触感\"加上一句话指令，自己决定下一步该怎么用手。",
    "wordCount": 2334,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "language",
      "vision",
      "tactile"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tla-tactile-language-action/",
    "sourcePath": "https://arxiv.org/abs/2503.08548",
    "status": "auto-summary-light"
  },
  {
    "slug": "code-as-policies",
    "num": 75,
    "title": "Code as Policies: Language Model Programs for Embodied Control",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "founder",
    "year": 2023,
    "venue": "ICRA",
    "difficulty": 3,
    "tldr": "你说一句\"把方块叠进碗里\"，AI 当场写几行 Python 代码，机器人立刻照着跑。不用提前教它新动作。",
    "wordCount": 5094,
    "readingMinutes": 15,
    "tags": [
      "language",
      "manipulation",
      "RL",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/code-as-policies/",
    "sourcePath": "papers/code-as-policies/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "inner-monologue",
    "num": 76,
    "title": "Inner Monologue: Embodied Reasoning through Planning with Language Models",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "founder",
    "year": 2022,
    "venue": "CoRL",
    "difficulty": 3,
    "tldr": "让机器人边干活边在心里念叨：看到啥、做成没、人改主意没，全翻成文字塞回 AI，它就能边做边改计划。",
    "wordCount": 5505,
    "readingMinutes": 16,
    "tags": [
      "language",
      "vision",
      "manipulation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/inner-monologue/",
    "sourcePath": "papers/inner-monologue/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "llm-plus-p",
    "num": 77,
    "title": "LLM+P: Empowering LLMs with Optimal Planning",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "founder",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "让 LLM 只当翻译——把你说的话翻译成机器格式，真正的规划交给老牌算法去算。LLM 管说话，算法管动脑子。",
    "wordCount": 1995,
    "readingMinutes": 6,
    "tags": [
      "language",
      "RL",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llm-plus-p/",
    "sourcePath": "https://arxiv.org/abs/2304.11477",
    "status": "auto-summary-light"
  },
  {
    "slug": "palm-e",
    "num": 78,
    "title": "PaLM-E: An Embodied Multimodal Language Model",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "founder",
    "year": 2023,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "教 ChatGPT 长出眼睛和手脚：你说一句话 + 让它瞄一眼现场，它直接列出机器人该做的几步。",
    "wordCount": 6964,
    "readingMinutes": 20,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/palm-e/",
    "sourcePath": "papers/palm-e/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "progprompt",
    "num": 79,
    "title": "ProgPrompt",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "founder",
    "year": 2023,
    "venue": "ICRA",
    "difficulty": 2,
    "tldr": "让大模型像写代码一样做计划：你说\"把苹果放冰箱\"，它直接吐出一串 Python 调用，机器人照着一行行跑就行。",
    "wordCount": 2154,
    "readingMinutes": 6,
    "tags": [
      "language",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/progprompt/",
    "sourcePath": "https://arxiv.org/abs/2209.11302",
    "status": "auto-summary-light"
  },
  {
    "slug": "chatgpt-for-robotics",
    "num": 80,
    "title": "ChatGPT for Robotics",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "classic",
    "year": 2023,
    "venue": "IEEE Access",
    "difficulty": 2,
    "tldr": "教 ChatGPT 当机器人的\"代写助理\"：先告诉它机器人会做哪些事，再让它把人话翻成代码，人盯着改。",
    "wordCount": 2194,
    "readingMinutes": 6,
    "tags": [
      "language",
      "RL",
      "imitation",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/chatgpt-for-robotics/",
    "sourcePath": "https://arxiv.org/abs/2306.17582",
    "status": "auto-summary-light"
  },
  {
    "slug": "gensim",
    "num": 81,
    "title": "GenSim",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "classic",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "让 ChatGPT 当\"出题老师\"，自动给机器人编一堆练习关卡，连标准答案也一起写好。",
    "wordCount": 2114,
    "readingMinutes": 6,
    "tags": [
      "language",
      "vision",
      "manipulation",
      "imitation",
      "sim2real",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gensim/",
    "sourcePath": "https://arxiv.org/abs/2310.01361",
    "status": "auto-summary-light"
  },
  {
    "slug": "roboflamingo",
    "num": 82,
    "title": "RoboFlamingo",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "classic",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "拿一个已经会看图说话的现成大模型当大脑，后面接一只\"小手\"，就教会机械臂干活——不用从头训。",
    "wordCount": 2089,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "vision",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/roboflamingo/",
    "sourcePath": "https://arxiv.org/abs/2311.01378",
    "status": "auto-summary-light"
  },
  {
    "slug": "tree-planner",
    "num": 83,
    "title": "Tree-Planner",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "classic",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "让大模型一次写好十份菜谱，把重复步骤合成一棵树，做菜时照树走，错了就换条岔路，不用反复打电话问。",
    "wordCount": 2402,
    "readingMinutes": 7,
    "tags": [
      "language",
      "locomotion",
      "world-model",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tree-planner/",
    "sourcePath": "https://arxiv.org/abs/2310.08582",
    "status": "auto-summary-light"
  },
  {
    "slug": "voxposer",
    "num": 84,
    "title": "VoxPoser",
    "topic": "planning",
    "topicLabel": "High-Level Planning",
    "era": "classic",
    "year": 2023,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "VoxPoser 让大模型给机器人画两张 3D 地图：红色地方要去，灰色地方要躲，机器人照着地图走出动作，全程不训练新模型。",
    "wordCount": 2090,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "3D",
      "language",
      "vision",
      "RL",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/voxposer/",
    "sourcePath": "https://arxiv.org/abs/2307.05973",
    "status": "auto-summary-light"
  },
  {
    "slug": "millimap",
    "num": 85,
    "title": "See Through Smoke: Robust Indoor Mapping with Low-cost mmWave Radar",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "founder",
    "year": 2020,
    "venue": "SenSys",
    "difficulty": 3,
    "tldr": "机器人在浓烟里也能画出清晰的房间地图——靠一颗几十块的小雷达加一个会\"脑补\"的神经网络。 具体两招： 训练时让贵的激光雷达（lidar）和便宜的雷达坐同一辆车，把 lidar 的清晰图当作业答案喂给神经网络（cGAN），教雷达学会脑补。学完老师下车，雷达单飞。 认门/墙/玻璃/电…",
    "wordCount": 6108,
    "readingMinutes": 17,
    "tags": [
      "vision",
      "RF-radar",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/millimap/",
    "sourcePath": "papers/millimap/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "person-in-wifi",
    "num": 86,
    "title": "Can WiFi Estimate Person Pose?",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "founder",
    "year": 2019,
    "venue": "ICCV",
    "difficulty": 3,
    "tldr": "想象你家路由器除了上网，还能告诉你\"屋里那个人正在做啥姿势\"——胳膊抬到哪、腿怎么弯，全画给你看。",
    "wordCount": 6303,
    "readingMinutes": 18,
    "tags": [
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/person-in-wifi/",
    "sourcePath": "papers/person-in-wifi/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "3drimr",
    "num": 87,
    "title": "3DRIMR: 3D Reconstruction and Imaging via mmWave Radar based on Deep Learning",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2021,
    "venue": "IPCCC",
    "difficulty": 3,
    "tldr": "用 AI 教小雷达\"看清\"物体长啥样：从糊糊的电波信号里还原出完整 3D 形状，烟雾灰尘暗光里也能用。",
    "wordCount": 2599,
    "readingMinutes": 7,
    "tags": [
      "3D",
      "RF-radar",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3drimr/",
    "sourcePath": "https://arxiv.org/abs/2108.02858",
    "status": "auto-summary-light"
  },
  {
    "slug": "milliego",
    "num": 88,
    "title": "milliEgo: Single-chip mmWave Radar Aided Egomotion Estimation via Deep Sensor Fusion",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2020,
    "venue": "SenSys",
    "difficulty": 3,
    "tldr": "把便宜的毫米波雷达和身上的\"动作感应器\"（IMU）用神经网络拼起来，让机器在黑暗、烟雾里也能算出自己走到了哪。",
    "wordCount": 2179,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "vision",
      "RF-radar",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/milliego/",
    "sourcePath": "https://arxiv.org/abs/2006.02266",
    "status": "auto-summary-light"
  },
  {
    "slug": "radarhd",
    "num": 89,
    "title": "High Resolution Point Clouds from mmWave Radar",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2023,
    "venue": "ICRA",
    "difficulty": 3,
    "tldr": "便宜雷达拍出来的画面很糊。RadarHD 用神经网络当翻译，把糊画面改成像激光雷达那样清晰的点云图，烟雾、黑暗里都能用。",
    "wordCount": 2660,
    "readingMinutes": 8,
    "tags": [
      "3D",
      "RF-radar",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/radarhd/",
    "sourcePath": "https://arxiv.org/abs/2206.09273",
    "status": "auto-summary-light"
  },
  {
    "slug": "radarslam",
    "num": 90,
    "title": "RadarSLAM: Radar based Large-Scale SLAM in All Weathers",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2020,
    "venue": "BMVC",
    "difficulty": 4,
    "tldr": "让一台\"会转圈的雷达\"在大雾大雪天里也能给车画地图、记住自己走过哪。",
    "wordCount": 2561,
    "readingMinutes": 7,
    "tags": [
      "vision",
      "RF-radar",
      "navigation",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/radarslam/",
    "sourcePath": "https://arxiv.org/abs/2005.02198",
    "status": "auto-summary-light"
  },
  {
    "slug": "rf-pose-through-wall",
    "num": 91,
    "title": "Through-Wall Pose Imaging in Real-Time with a Many-to-Many Encoder/Decoder Paradigm",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2019,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "一个 Wi-Fi 小盒子隔着墙照过去，就能画出屋里人的骨架动画——摄像头当老师，电波当学生，学一遍就会了。 更具体一点： 输入：一个商用雷达（Walabot Developer，几百美元）发出去的电波被人体反射回来后形成的 3D 强度场。 输出：屋内每个人的 15 关节点骨架，每…",
    "wordCount": 11437,
    "readingMinutes": 33,
    "tags": [
      "vision",
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rf-pose-through-wall/",
    "sourcePath": "papers/rf-pose-through-wall/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "rfmask",
    "num": 92,
    "title": "RFMask: A Simple Baseline for Human Silhouette Segmentation with Radio Signals",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2022,
    "venue": "TMM",
    "difficulty": 3,
    "tldr": "漆黑屋子里相机看不见，但雷达回波能\"听\"出人形。RFMask 让模型把雷达信号直接画成每个人的精细剪影——头、肩、胳膊都画出来。",
    "wordCount": 2615,
    "readingMinutes": 7,
    "tags": [
      "vision",
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rfmask/",
    "sourcePath": "https://arxiv.org/abs/2201.10175",
    "status": "auto-summary-light"
  },
  {
    "slug": "rfpose-ot",
    "num": 93,
    "title": "RFPose-OT: RF-Based 3D Human Pose Estimation via Optimal Transport Theory",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "classic",
    "year": 2023,
    "venue": "TCSVT",
    "difficulty": 4,
    "tldr": "用雷达回声画出人的姿势：直接学容易乱猜，先把\"回声\"和\"姿势\"两边的特征对齐，再画关节，换房间也更稳。",
    "wordCount": 2475,
    "readingMinutes": 7,
    "tags": [
      "3D",
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rfpose-ot/",
    "sourcePath": "https://arxiv.org/abs/2301.13013",
    "status": "auto-summary-light"
  },
  {
    "slug": "argus-mmego",
    "num": 94,
    "title": "Argus: Multi-View Egocentric Human Mesh Reconstruction Based on Stripped-Down Wearable mmWave Add-on",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "frontier",
    "year": 2024,
    "venue": "SenSys",
    "difficulty": 4,
    "tldr": "在肩膀、胸口、手腕各贴一片简化雷达，每片只能看到身体一小块，算法把这些局部信号拼成完整的 3D 人体形状。",
    "wordCount": 2389,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "3D",
      "vision",
      "audio-speech",
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/argus-mmego/",
    "sourcePath": "https://arxiv.org/abs/2411.00419",
    "status": "auto-summary-light"
  },
  {
    "slug": "mmdiff",
    "num": 95,
    "title": "Diffusion Model is a Good Pose Estimator from 3D RF-Vision",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "frontier",
    "year": 2024,
    "venue": "CVPR",
    "difficulty": 4,
    "tldr": "毫米波雷达拍出的人像隔了层毛玻璃。这篇论文让 AI 从噪点里一笔笔擦出人体骨架，比一次猜准稳得多。CVPR 2024 收录。",
    "wordCount": 2894,
    "readingMinutes": 8,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "vision",
      "RF-radar",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mmdiff/",
    "sourcePath": "https://arxiv.org/abs/2403.16198",
    "status": "auto-summary-light"
  },
  {
    "slug": "panoradar",
    "num": 96,
    "title": "Enabling Visual Recognition at Radio Frequency (PanoRadar)",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "frontier",
    "year": 2024,
    "venue": "MobiCom",
    "difficulty": 4,
    "tldr": "PanoRadar 把便宜的小雷达装到一个转台上边转边扫，再让神经网络把模糊回声拼成 3D 地图，让雷达像眼睛一样\"看见\"房间。",
    "wordCount": 2696,
    "readingMinutes": 8,
    "tags": [
      "vision",
      "audio-speech",
      "RF-radar",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/panoradar/",
    "sourcePath": "https://arxiv.org/abs/2405.19516",
    "status": "auto-summary-light"
  },
  {
    "slug": "wave-former",
    "num": 97,
    "title": "Wave-Former: Through-Occlusion 3D Reconstruction via Wireless Shape Completion",
    "topic": "rf",
    "topicLabel": "RF Perception & Mapping",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "毫米波信号能穿过纸箱、布帘，Wave-Former 把弹回来的模糊回声拼成藏在背后的杯子、瓶子的完整 3D 形状。",
    "wordCount": 2485,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "vision",
      "audio-speech",
      "RF-radar"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/wave-former/",
    "sourcePath": "https://arxiv.org/abs/2511.14152",
    "status": "auto-summary-light"
  },
  {
    "slug": "habitat",
    "num": 98,
    "title": "Habitat: A Platform for Embodied AI Research",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "founder",
    "year": 2019,
    "venue": "ICCV",
    "difficulty": 2,
    "tldr": "给家用机器人造一个跑得飞快的\"VR 房子\"，让它在里面绕路撞墙练几千万步，再上岗去你家。",
    "wordCount": 6056,
    "readingMinutes": 17,
    "tags": [
      "vision",
      "navigation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/habitat/",
    "sourcePath": "papers/habitat/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "isaac-gym",
    "num": 99,
    "title": "Isaac Gym: High Performance GPU-Based Physics Simulation For Robot Learning",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "founder",
    "year": 2021,
    "venue": "NeurIPS Datasets",
    "difficulty": 3,
    "tldr": "一句话：把\"算物理\"和\"训神经网络\"塞进同一张显卡，机器人学走路从\"几千台 CPU 跑一晚\"压成\"一张卡跑几分钟\"。 类比：以前训机器人像切菜、炒菜、装盘分三个房间，端来端去比真做菜还累；Isaac Gym 把厨房合并，菜不动、工具换着上。 效果对照：OpenAI 训魔方机械手用…",
    "wordCount": 5362,
    "readingMinutes": 15,
    "tags": [
      "RL",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/isaac-gym/",
    "sourcePath": "papers/isaac-gym/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "dexmv",
    "num": 100,
    "title": "DexMV",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "classic",
    "year": 2022,
    "venue": "ECCV",
    "difficulty": 4,
    "tldr": "让机械手学拧瓶盖、倒水太难，DexMV 让算法看人手视频学，把人的动作\"翻译\"成仿真里机械手能照着练的示范。",
    "wordCount": 2271,
    "readingMinutes": 6,
    "tags": [
      "3D",
      "vision",
      "manipulation",
      "RL",
      "imitation",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dexmv/",
    "sourcePath": "https://arxiv.org/abs/2108.05877",
    "status": "auto-summary-light"
  },
  {
    "slug": "habitat-2",
    "num": 101,
    "title": "Habitat 2.0",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "classic",
    "year": 2021,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "上一代 Habitat 只能在虚拟房子里走路看；2.0 让小机器人能真的开冰箱、把杯子从厨房拿到客厅做家务。",
    "wordCount": 2156,
    "readingMinutes": 6,
    "tags": [
      "3D",
      "manipulation",
      "RL",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/habitat-2/",
    "sourcePath": "https://arxiv.org/abs/2106.14405",
    "status": "auto-summary-light"
  },
  {
    "slug": "maniskill",
    "num": 102,
    "title": "ManiSkill",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "classic",
    "year": 2021,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "ManiSkill 是教机器人开抽屉、开柜门这种家具的统一考场—— 专测它练完几十个柜子之后，能不能上手没见过的第 101 个。",
    "wordCount": 2042,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "3D",
      "vision",
      "manipulation",
      "RL",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/maniskill/",
    "sourcePath": "https://arxiv.org/abs/2107.14483",
    "status": "auto-summary-light"
  },
  {
    "slug": "procthor",
    "num": 103,
    "title": "ProcTHOR",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "classic",
    "year": 2022,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "过去训练 AI 在屋里走来走去，得人工一间一间搭样板房，慢且少。ProcTHOR 让电脑按规则批量造 1 万套房，AI 见多了，换个没去过的房子也能找到东西。",
    "wordCount": 2071,
    "readingMinutes": 6,
    "tags": [
      "language",
      "sim2real",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/procthor/",
    "sourcePath": "https://arxiv.org/abs/2206.06994",
    "status": "auto-summary-light"
  },
  {
    "slug": "sapien",
    "num": 104,
    "title": "SAPIEN: A SimulAted Part-based Interactive ENvironment",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "classic",
    "year": 2020,
    "venue": "CVPR",
    "difficulty": 3,
    "tldr": "给机器人造了一个虚拟宜家展厅，2,346 件家具每个抽屉、每扇门、每个瓶盖都能真的拉开、推开、拧开。",
    "wordCount": 6256,
    "readingMinutes": 18,
    "tags": [
      "3D",
      "RL"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/sapien/",
    "sourcePath": "papers/sapien/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "behavior-1k",
    "num": 105,
    "title": "BEHAVIOR-1K",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "frontier",
    "year": 2024,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "斯坦福搭的\"机器人家务考场\"：1000 道家务题、50 间样板房、9000 多件物品，让所有人用同一把尺子比\"机器人到底会不会做家务\"。",
    "wordCount": 1983,
    "readingMinutes": 6,
    "tags": [
      "3D",
      "RL",
      "imitation",
      "VLA",
      "sim2real",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/behavior-1k/",
    "sourcePath": "https://arxiv.org/abs/2403.09227",
    "status": "auto-summary-light"
  },
  {
    "slug": "habitat-3",
    "num": 106,
    "title": "Habitat 3.0",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "frontier",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "在虚拟的家里加一个会走会动的\"假人\"，让机器人练习扫地搬东西时，得学会一边干活一边躲人、配合人。",
    "wordCount": 2405,
    "readingMinutes": 7,
    "tags": [
      "language",
      "manipulation",
      "locomotion",
      "navigation",
      "RL",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/habitat-3/",
    "sourcePath": "https://arxiv.org/abs/2310.13724",
    "status": "auto-summary-light"
  },
  {
    "slug": "isaac-lab",
    "num": 107,
    "title": "Isaac Lab",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "机器人在电脑里\"练功\"的虚拟训练场。以前练得飞快但看不清画面，画面漂亮又练得慢；Isaac Lab 把这两件事捏到了一起。",
    "wordCount": 1944,
    "readingMinutes": 6,
    "tags": [
      "manipulation",
      "locomotion",
      "RL",
      "sim2real",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/isaac-lab/",
    "sourcePath": "https://arxiv.org/abs/2511.04831",
    "status": "auto-summary-light"
  },
  {
    "slug": "mujoco-playground",
    "num": 108,
    "title": "MuJoCo Playground",
    "topic": "sim",
    "topicLabel": "Simulation & Sim2Real",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "一个 pip install 就能装好的开源仿真平台，让机器人先在电脑里把走路、抓东西练熟，再几乎原样搬到真机上跑。",
    "wordCount": 1892,
    "readingMinutes": 5,
    "tags": [
      "3D",
      "manipulation",
      "locomotion",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/mujoco-playground/",
    "sourcePath": "https://arxiv.org/abs/2502.08844",
    "status": "auto-summary-light"
  },
  {
    "slug": "rt-1",
    "num": 109,
    "title": "RT-1: Robotics Transformer for Real-World Control at Scale",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "founder",
    "year": 2022,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "让机器人看完 13 万段人类亲手示范，就能听一句中文，在真办公室里把可乐罐拿出来放进抽屉。",
    "wordCount": 6767,
    "readingMinutes": 19,
    "tags": [
      "transformer",
      "language",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rt-1/",
    "sourcePath": "papers/rt-1/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "dp3",
    "num": 110,
    "title": "3D Diffusion Policy (DP3)",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "classic",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "教机器人擦桌子，不给它看照片，改给它看带深度的 3D 点云。结果只用 10 段录像就够学会一个新任务。",
    "wordCount": 2157,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "vision",
      "manipulation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dp3/",
    "sourcePath": "https://arxiv.org/abs/2403.03954",
    "status": "auto-summary-light"
  },
  {
    "slug": "octo",
    "num": 111,
    "title": "Octo: An Open-Source Generalist Robot Policy",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "classic",
    "year": 2024,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "第一个真正开源的通用机器人\"大脑\"：先看 80 万段机器人录像学基础动作，你下载回来微调几小时，就能让自家机器人学新活。",
    "wordCount": 2256,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "vision",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/octo/",
    "sourcePath": "https://arxiv.org/abs/2405.12213",
    "status": "auto-summary-light"
  },
  {
    "slug": "rt-2",
    "num": 112,
    "title": "RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "classic",
    "year": 2023,
    "venue": "CoRL",
    "difficulty": 4,
    "tldr": "把机器人动作翻译成一句话，让会看图聊天的 AI 用写句子的方式开口指挥机器人——它会写字，就能动手。",
    "wordCount": 6897,
    "readingMinutes": 20,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rt-2/",
    "sourcePath": "papers/rt-2/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "rt-trajectory",
    "num": 113,
    "title": "RT-Trajectory: Robotic Task Generalization via Hindsight Trajectory Sketches",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "classic",
    "year": 2023,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "教机器人做新动作，光说话不够、给一张完成图也不够。这篇论文说：在画面上画一条\"手该走的路\"——机器人立刻照着做。",
    "wordCount": 6022,
    "readingMinutes": 17,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "vision",
      "manipulation",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rt-trajectory/",
    "sourcePath": "papers/rt-trajectory/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "3d-vla",
    "num": 114,
    "title": "3D-VLA",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2024,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "让机器人除了看平面照片，还能\"摸到\"立体形状；动手前先在脑里画一张\"做完后的样子\"，再照着画面去动。",
    "wordCount": 2220,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "3D",
      "language",
      "vision",
      "RL"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/3d-vla/",
    "sourcePath": "https://arxiv.org/abs/2403.09631",
    "status": "auto-summary-light"
  },
  {
    "slug": "dexvla",
    "num": 115,
    "title": "DexVLA",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "让一个只会\"看图说话\"的大脑别动，给它配一只 10 亿参数的\"专业的手\"。脑负责理解，手负责干活，互不干扰。",
    "wordCount": 2527,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "vision",
      "manipulation",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dexvla/",
    "sourcePath": "https://arxiv.org/abs/2502.05855",
    "status": "auto-summary-light"
  },
  {
    "slug": "gr-2",
    "num": 116,
    "title": "GR-2: Generative Video-Language-Action Model",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "让机器人先刷 3800 万段网络视频攒常识，再练动手；它干活时脑子里会\"预演\"下一秒的画面。",
    "wordCount": 2625,
    "readingMinutes": 8,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "RL",
      "world-model",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gr-2/",
    "sourcePath": "https://arxiv.org/abs/2410.06158",
    "status": "auto-summary-light"
  },
  {
    "slug": "openhelix",
    "num": 117,
    "title": "OpenHelix",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "机器人版的\"大脑加小脑\"分工：大脑慢慢听懂你说的话，小脑飞快动手干活。代码全部开源，对标 Figure 公司不公开的 Helix。",
    "wordCount": 2535,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/openhelix/",
    "sourcePath": "https://arxiv.org/abs/2505.03912",
    "status": "auto-summary-light"
  },
  {
    "slug": "openvla-oft",
    "num": 118,
    "title": "OpenVLA-OFT",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2025,
    "venue": "RSS",
    "difficulty": 3,
    "tldr": "原版机器人模型一个字一个字念动作，慢还一抖一抖。OpenVLA-OFT 拧开三个开关——一口气说、一段段说、说连续数字——又快又稳。",
    "wordCount": 2105,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "vision",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/openvla-oft/",
    "sourcePath": "https://arxiv.org/abs/2502.19645",
    "status": "auto-summary-light"
  },
  {
    "slug": "rdt-1b",
    "num": 119,
    "title": "RDT-1B: Diffusion Foundation Model for Bimanual Manipulation",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "清华团队给双臂机器人配的\"大脑\"：10 亿参数，听一句话就能让两只机械臂配合着倒水、叠衣服。",
    "wordCount": 2388,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "vision",
      "manipulation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/rdt-1b/",
    "sourcePath": "https://arxiv.org/abs/2410.07864",
    "status": "auto-summary-light"
  },
  {
    "slug": "robomamba",
    "num": 120,
    "title": "RoboMamba",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2024,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "机器人脑子原本用 Transformer 拼出来，反应慢、显存吃紧。RoboMamba 换成 Mamba（一种\"流水线式\"架构），让机器人想得更快、更省。",
    "wordCount": 1977,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "mamba-ssm",
      "language",
      "vision",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/robomamba/",
    "sourcePath": "https://arxiv.org/abs/2406.04339",
    "status": "auto-summary-light"
  },
  {
    "slug": "spatialvla",
    "num": 121,
    "title": "SpatialVLA",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "教机器人两件事：用普通摄像头也能看出远近；常用动作存成肌肉记忆，不用每次重新算。",
    "wordCount": 2096,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "3D",
      "vision",
      "VLA",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/spatialvla/",
    "sourcePath": "https://arxiv.org/abs/2501.15830",
    "status": "auto-summary-light"
  },
  {
    "slug": "tinyvla",
    "num": 122,
    "title": "TinyVLA",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2024,
    "venue": "RA-L",
    "difficulty": 3,
    "tldr": "把会听话的机器人大脑瘦身到 1.4B，动作生成换成\"先乱后凿\"的扩散模型，不靠云端也能实时干活。",
    "wordCount": 2382,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "flow-matching",
      "transformer",
      "language",
      "vision",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tinyvla/",
    "sourcePath": "https://arxiv.org/abs/2409.12514",
    "status": "auto-summary-light"
  },
  {
    "slug": "tracevla",
    "num": 123,
    "title": "TraceVLA: Visual Trace Prompting",
    "topic": "vla",
    "topicLabel": "End-to-End VLA",
    "era": "frontier",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "机器人的手刚走过哪里？TraceVLA 把这条路径直接画在它看到的照片上，让它看见自己的足迹，再决定下一步往哪动。",
    "wordCount": 2136,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "mamba-ssm",
      "language",
      "vision",
      "VLA",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/tracevla/",
    "sourcePath": "https://arxiv.org/abs/2412.10345",
    "status": "auto-summary-light"
  },
  {
    "slug": "clip",
    "num": 124,
    "title": "Learning Transferable Visual Models From Natural Language Supervision",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "founder",
    "year": 2021,
    "venue": "ICML",
    "difficulty": 3,
    "tldr": "教 AI 同时认图和认字，把 4 亿对网上图文塞进同一张坐标。之后你说\"一只猫\"，它就能从新图里挑出猫——不用为新任务再训一遍。",
    "wordCount": 6293,
    "readingMinutes": 18,
    "tags": [
      "transformer",
      "language",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/clip/",
    "sourcePath": "papers/clip/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "flamingo",
    "num": 125,
    "title": "Flamingo: a Visual Language Model for Few-Shot Learning",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "founder",
    "year": 2022,
    "venue": "NeurIPS",
    "difficulty": 4,
    "tldr": "教一个会聊天的 AI 也学会看图，给它看两三个示范，它就能照着做新题。",
    "wordCount": 6302,
    "readingMinutes": 18,
    "tags": [
      "transformer",
      "language",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/flamingo/",
    "sourcePath": "papers/flamingo/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "blip-2",
    "num": 126,
    "title": "BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2023,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "BLIP-2 不动两个大模型——一个负责看图、一个负责说话——只在中间训练一个小\"翻译\"，就让 AI 学会了看图说话。",
    "wordCount": 2678,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/blip-2/",
    "sourcePath": "https://arxiv.org/abs/2301.12597",
    "status": "auto-summary-light"
  },
  {
    "slug": "blip",
    "num": 127,
    "title": "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2022,
    "venue": "ICML",
    "difficulty": 3,
    "tldr": "一句话：让一个模型同时学会看图和写字，再让它帮自己把网上烂配文重写干净，回头再用干净数据训一遍——多个任务全线变强。 三个关键贡献： MED（Multimodal mixture of Encoder-Decoder）：一个模型三种身份切换——纯编码器、看图的文本编码器、看图的文",
    "wordCount": 5849,
    "readingMinutes": 17,
    "tags": [
      "transformer",
      "language",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/blip/",
    "sourcePath": "papers/blip/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "deepseek-vl",
    "num": 128,
    "title": "DeepSeek-VL: Towards Real-World Vision-Language Understanding",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "DeepSeek 在 2024 年开源的\"会看图\"小模型，主打能看清发票、PPT、论文截图里的小字，不只会答考试题。",
    "wordCount": 2729,
    "readingMinutes": 8,
    "tags": [
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/deepseek-vl/",
    "sourcePath": "https://arxiv.org/abs/2403.05525",
    "status": "auto-summary-light"
  },
  {
    "slug": "eva-clip",
    "num": 129,
    "title": "EVA-CLIP: Improved Training Techniques for CLIP at Scale",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "不改 CLIP 架构，只改训练流程：用一个已经\"懂图\"的视觉模型起步 + 训练时只看半张图——更少数据反而训出更强的看图模型。",
    "wordCount": 2824,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/eva-clip/",
    "sourcePath": "https://arxiv.org/abs/2303.15389",
    "status": "auto-summary-light"
  },
  {
    "slug": "filip",
    "num": 130,
    "title": "FILIP: Fine-grained Interactive Language-Image Pre-Training",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2022,
    "venue": "ICLR",
    "difficulty": 3,
    "tldr": "以前是\"整张图配整句话\"，FILIP 让图的每一小块和句子的每个词互相找最像的伙伴，模型就能学会\"狗在左下角\"这种细节。",
    "wordCount": 2695,
    "readingMinutes": 8,
    "tags": [
      "transformer",
      "vision"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/filip/",
    "sourcePath": "https://arxiv.org/abs/2111.07783",
    "status": "auto-summary-light"
  },
  {
    "slug": "florence-2",
    "num": 131,
    "title": "Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2024,
    "venue": "CVPR",
    "difficulty": 3,
    "tldr": "一个看图模型，你跟它说\"圈猫\"\"描述这张图\"\"找红车\"它都能用同一个脑子做，回答全是一段文字。",
    "wordCount": 2232,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/florence-2/",
    "sourcePath": "https://arxiv.org/abs/2311.06242",
    "status": "auto-summary-light"
  },
  {
    "slug": "internvl",
    "num": 132,
    "title": "InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2024,
    "venue": "CVPR",
    "difficulty": 4,
    "tldr": "让\"看图的脑子\"也长到 6B 参数，和\"会说话的脑子\"一样大，AI 看图说话才不偏科，而且开源就能用。",
    "wordCount": 2239,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/internvl/",
    "sourcePath": "https://arxiv.org/abs/2312.14238",
    "status": "auto-summary-light"
  },
  {
    "slug": "llava-1-5",
    "num": 133,
    "title": "Improved Baselines with Visual Instruction Tuning",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2024,
    "venue": "CVPR",
    "difficulty": 2,
    "tldr": "给会聊天的 AI 配一副\"看图眼镜\"。把眼镜从一片镜片换成两片，再多给它看点带字的图片，看图答题就刷榜了。",
    "wordCount": 2181,
    "readingMinutes": 6,
    "tags": [
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava-1-5/",
    "sourcePath": "https://arxiv.org/abs/2310.03744",
    "status": "auto-summary-light"
  },
  {
    "slug": "obelics",
    "num": 134,
    "title": "OBELICS",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2023,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "HuggingFace 把网上 1.41 亿个\"图文穿插\"的网页洗干净打包开源，让大家也能像 DeepMind 那样训出会看图读长文的模型。",
    "wordCount": 2181,
    "readingMinutes": 6,
    "tags": [
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/obelics/",
    "sourcePath": "https://arxiv.org/abs/2306.16527",
    "status": "auto-summary-light"
  },
  {
    "slug": "qwen-vl",
    "num": 135,
    "title": "Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "给会聊天的 AI 戴副眼镜：一次学会看图、念中英文招牌、用框指出物体、还能多轮聊天。这就是阿里 2023 年开源的 Qwen-VL。",
    "wordCount": 2132,
    "readingMinutes": 6,
    "tags": [
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/qwen-vl/",
    "sourcePath": "https://arxiv.org/abs/2308.12966",
    "status": "auto-summary-light"
  },
  {
    "slug": "siglip",
    "num": 136,
    "title": "Sigmoid Loss for Language Image Pre-Training",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "classic",
    "year": 2023,
    "venue": "ICCV",
    "difficulty": 3,
    "tldr": "教模型\"图配文字\"，CLIP 要全班一起排名打分，SigLIP 改成一对一判断\"是不是一对\"。算得快、省内存、小批也能学。",
    "wordCount": 2180,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "vision",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/siglip/",
    "sourcePath": "https://arxiv.org/abs/2303.15343",
    "status": "auto-summary-light"
  },
  {
    "slug": "idefics-2",
    "num": 137,
    "title": "What matters when building vision-language models?",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "做\"看图说话 AI\"时大家凭感觉选零件，这篇把每个选择拆开做对照实验，整理成一份避坑清单，再训了个 8B 模型当样板。",
    "wordCount": 2201,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/idefics-2/",
    "sourcePath": "https://arxiv.org/abs/2405.02246",
    "status": "auto-summary-light"
  },
  {
    "slug": "internvl-2-5",
    "num": 138,
    "title": "Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "把模型、数据、推理三件事一起加大，让免费开源的看图模型第一次在大学考试里追上顶级闭源模型。",
    "wordCount": 2863,
    "readingMinutes": 8,
    "tags": [
      "language",
      "vision",
      "VLM",
      "dataset",
      "open-source"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/internvl-2-5/",
    "sourcePath": "https://arxiv.org/abs/2412.05271",
    "status": "auto-summary-light"
  },
  {
    "slug": "llama-3-herd",
    "num": 139,
    "title": "The Llama 3 Herd of Models",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "Meta 把训练 Llama 3 大模型的全套\"菜谱\"公开了——用了什么料、多少张卡、跑多久、考多少分。",
    "wordCount": 1959,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "language",
      "vision",
      "RL",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llama-3-herd/",
    "sourcePath": "https://arxiv.org/abs/2407.21783",
    "status": "auto-summary-light"
  },
  {
    "slug": "llava-next-interleave",
    "num": 140,
    "title": "LLaVA-NeXT-Interleave",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "教 AI 像刷图文并茂的小红书：图和字按顺序穿着读，多图、视频、3D 都用这一招，不用各训一个模型。",
    "wordCount": 2328,
    "readingMinutes": 7,
    "tags": [
      "3D",
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava-next-interleave/",
    "sourcePath": "https://arxiv.org/abs/2407.07895",
    "status": "auto-summary-light"
  },
  {
    "slug": "llava-onevision",
    "num": 141,
    "title": "LLaVA-OneVision: Easy Visual Task Transfer",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "一套配方教会一个模型同时看懂单张图、几张图、和视频，开源圈第一次在视频上接近 GPT-4V。",
    "wordCount": 1987,
    "readingMinutes": 6,
    "tags": [
      "language",
      "vision",
      "VLA",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/llava-onevision/",
    "sourcePath": "https://arxiv.org/abs/2408.03326",
    "status": "auto-summary-light"
  },
  {
    "slug": "long-clip",
    "num": 142,
    "title": "Long-CLIP: Unlocking the Long-Text Capability of CLIP",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "ECCV",
    "difficulty": 3,
    "tldr": "给只能读 77 字短纸条的 CLIP 做两个小手术，让它能读 248 字的长纸条，但又没忘掉原来认识的那些短词。",
    "wordCount": 2395,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "vision",
      "VLM"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/long-clip/",
    "sourcePath": "https://arxiv.org/abs/2403.15378",
    "status": "auto-summary-light"
  },
  {
    "slug": "pixtral-12b",
    "num": 143,
    "title": "Pixtral 12B",
    "topic": "vlm-foundation",
    "topicLabel": "VLM Foundation",
    "era": "frontier",
    "year": 2024,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "Mistral 开源的\"会看图聊天的助手\"——从一开始就同时学看图和说话，图想多大就多大，能免费拿去做产品。",
    "wordCount": 2068,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "language",
      "vision",
      "VLM",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/pixtral-12b/",
    "sourcePath": "https://arxiv.org/abs/2410.07073",
    "status": "auto-summary-light"
  },
  {
    "slug": "dreamer-v1",
    "num": 144,
    "title": "Dream to Control: Learning Behaviors by Latent Imagination",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "founder",
    "year": 2020,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "教 AI 在脑子里反复\"做白日梦\"演练动作，不用真去摔跤，就能学会跑步、翻跟头这种复杂动作。",
    "wordCount": 6449,
    "readingMinutes": 18,
    "tags": [
      "mamba-ssm",
      "RL",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dreamer-v1/",
    "sourcePath": "papers/dreamer-v1/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "world-models-ha",
    "num": 145,
    "title": "World Models",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "founder",
    "year": 2018,
    "venue": "NeurIPS",
    "difficulty": 3,
    "tldr": "让 AI 先在自己脑子里反复\"做白日梦\"练打游戏，练熟了再去真游戏里上场——居然真能赢。",
    "wordCount": 6473,
    "readingMinutes": 18,
    "tags": [
      "RL",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/world-models-ha/",
    "sourcePath": "papers/world-models-ha/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "daydreamer",
    "num": 146,
    "title": "DayDreamer",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "classic",
    "year": 2022,
    "venue": "CoRL",
    "difficulty": 3,
    "tldr": "让一只四足机器人不靠仿真，在真实世界里 1 小时就学会走路——靠的是边走边在脑子里\"做梦\"演练。",
    "wordCount": 2994,
    "readingMinutes": 9,
    "tags": [
      "mamba-ssm",
      "vision",
      "RL",
      "imitation",
      "world-model",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/daydreamer/",
    "sourcePath": "https://arxiv.org/abs/2206.14176",
    "status": "auto-summary-light"
  },
  {
    "slug": "dreamer-v2",
    "num": 147,
    "title": "Mastering Atari with Discrete World Models",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "classic",
    "year": 2021,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "让 AI 闭眼\"做白日梦\"练打老游戏，第一次只靠脑子里想象就打到人类水平。",
    "wordCount": 6048,
    "readingMinutes": 17,
    "tags": [
      "mamba-ssm",
      "vision",
      "RL",
      "imitation",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dreamer-v2/",
    "sourcePath": "papers/dreamer-v2/paper.pdf",
    "status": "auto-summary"
  },
  {
    "slug": "dreamer-v3",
    "num": 148,
    "title": "Dreamer V3: Mastering Diverse Domains through World Models",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "classic",
    "year": 2025,
    "venue": "Nature",
    "difficulty": 4,
    "tldr": "同一套设置，让一个 AI 自己玩 150 多种游戏都不用改参数，还第一次靠自己挖到《我的世界》里的钻石。",
    "wordCount": 2507,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "mamba-ssm",
      "RL",
      "world-model",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/dreamer-v3/",
    "sourcePath": "https://arxiv.org/abs/2301.04104",
    "status": "auto-summary-light"
  },
  {
    "slug": "iris-world-model",
    "num": 149,
    "title": "Transformers are Sample-Efficient World Models",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "classic",
    "year": 2023,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "把游戏画面切成一格格\"积木\"，让 AI 像写句子一样接龙下一帧，然后让它在脑子里\"自己跟自己玩\"练强化学习——只玩两小时就接近人类水平。",
    "wordCount": 2446,
    "readingMinutes": 7,
    "tags": [
      "transformer",
      "mamba-ssm",
      "language",
      "RL",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/iris-world-model/",
    "sourcePath": "https://arxiv.org/abs/2209.00588",
    "status": "auto-summary-light"
  },
  {
    "slug": "transformer-world-model",
    "num": 150,
    "title": "TWM: Transformer-based World Models",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "classic",
    "year": 2023,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "agent 在脑子里\"做梦\"练本事。这篇把梦的引擎从 RNN 换成 Transformer，记得更长，做得更准。",
    "wordCount": 2258,
    "readingMinutes": 6,
    "tags": [
      "transformer",
      "mamba-ssm",
      "language",
      "RL",
      "world-model",
      "dataset"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/transformer-world-model/",
    "sourcePath": "https://arxiv.org/abs/2303.07109",
    "status": "auto-summary-light"
  },
  {
    "slug": "1x-world-model-2025",
    "num": 151,
    "title": "1X World Model Challenge",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 3,
    "tldr": "1X 教人形机器人 Neo \"脑补下一秒画面\"：拿现成视频 AI 当底子，喂自家机器人录像微调，再做成公开赛让大家来卷。",
    "wordCount": 2600,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "RL",
      "imitation",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/1x-world-model-2025/",
    "sourcePath": "https://arxiv.org/abs/2510.07092",
    "status": "auto-summary-light"
  },
  {
    "slug": "cosmos-world-foundation",
    "num": 152,
    "title": "Cosmos World Foundation Model Platform",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "arXiv",
    "difficulty": 5,
    "tldr": "NVIDIA 用 2000 万小时真实视频，训了一个能\"猜下一秒物理世界长啥样\"的大模型，给机器人和无人车当通用底座。",
    "wordCount": 2284,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "RL",
      "world-model",
      "VLA"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/cosmos-world-foundation/",
    "sourcePath": "https://arxiv.org/abs/2501.03575",
    "status": "auto-summary-light"
  },
  {
    "slug": "gaia-1",
    "num": 153,
    "title": "GAIA-1",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2023,
    "venue": "arXiv",
    "difficulty": 4,
    "tldr": "GAIA-1 是个会做梦的开车模拟器：给它一段街景视频的开头加一句\"我现在打方向盘\"，它能接着画出后面几秒街上看到的画面。",
    "wordCount": 2288,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "language",
      "vision",
      "RL",
      "imitation"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/gaia-1/",
    "sourcePath": "https://arxiv.org/abs/2309.17080",
    "status": "auto-summary-light"
  },
  {
    "slug": "genie",
    "num": 154,
    "title": "Genie: Generative Interactive Environments",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2024,
    "venue": "ICML",
    "difficulty": 4,
    "tldr": "Genie 看一堆游戏录屏，自己猜出每帧之间\"按了什么键\"，再用这个\"按键\"画出下一帧——把死视频变成能玩的小游戏。",
    "wordCount": 2186,
    "readingMinutes": 6,
    "tags": [
      "diffusion",
      "transformer",
      "vision",
      "RL",
      "imitation",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/genie/",
    "sourcePath": "https://arxiv.org/abs/2402.15391",
    "status": "auto-summary-light"
  },
  {
    "slug": "navigation-world-models",
    "num": 155,
    "title": "Navigation World Models",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2025,
    "venue": "CVPR",
    "difficulty": 4,
    "tldr": "让机器人\"走\"之前先在脑子里放一段未来几秒的画面，看会不会撞墙，再决定真的怎么走。",
    "wordCount": 2468,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "transformer",
      "navigation",
      "RL",
      "world-model"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/navigation-world-models/",
    "sourcePath": "https://arxiv.org/abs/2412.03572",
    "status": "auto-summary-light"
  },
  {
    "slug": "unisim",
    "num": 156,
    "title": "UniSim",
    "topic": "world-model",
    "topicLabel": "World Model & Video Policy",
    "era": "frontier",
    "year": 2024,
    "venue": "ICLR",
    "difficulty": 4,
    "tldr": "看过海量视频后，你给它一个动作（说一句话 / 推一下机械臂 / 挪一下镜头），它就生成接下来世界长什么样的视频——像一台会脑补现实的\"游戏机\"。",
    "wordCount": 2530,
    "readingMinutes": 7,
    "tags": [
      "diffusion",
      "vision",
      "world-model",
      "VLM",
      "sim2real"
    ],
    "url": "https://estelledc.github.io/embodied-ai-reading-station/papers/unisim/",
    "sourcePath": "https://arxiv.org/abs/2310.06114",
    "status": "auto-summary-light"
  }
]