Где системный вызов write устанавливает errno в EINTR? - PullRequest
1 голос
/ 23 апреля 2020

Я обнаружил, что vim внутри cephfs почему-то постоянно получает сигналы, если ему разрешено создавать файл .swp. Это видно из strace (всё заканчивается системным вызовом write) и из gdb (кадр 1 — write_eintr):

/* Thin wrapper around write() that casts the buffer to (char *) and the
 * length to (size_t) before making the call. */
# define vim_write(fd, buf, count)  write((fd), (char *)(buf), (size_t) (count))

...

/*
 * write() wrapper that keeps writing until all of "buf" has gone out,
 * re-issuing the call whenever it fails with EINTR (for example because a
 * SIGWINCH arrived).
 * Returns the number of bytes written.  The result is smaller than "bufsize"
 * only when a write failed with an error other than EINTR.
 */
    long
write_eintr(int fd, void *buf, size_t bufsize)
{
    long    done = 0;
    long    n;

    while (done < (long)bufsize)
    {
        n = vim_write(fd, (char *)buf + done, bufsize - done);
        if (n >= 0)
        {
            /* Partial or complete write: account for it and carry on. */
            done += n;
            continue;
        }

        /* Only an interrupted call is retried; any other error ends it. */
        if (errno != EINTR)
            break;
    }
    return done;
}
#endif

Насколько я понимаю (IIUC), цикл просто продолжает вызывать write каждый раз, когда вызов прерывается сигналом (и errno при этом устанавливается в EINTR). Мне интересно, где именно устанавливается EINTR, поэтому я пытаюсь разобраться в коде ядра и glibc.

kernel:

/*
 * file_operations table for ceph files.
 *
 * Most callbacks point at ceph_* handlers; in particular .write_iter is
 * ceph_write_iter.  The splice callbacks and .compat_ioctl point at helpers
 * without the ceph_ prefix (generic_file_splice_read,
 * iter_file_splice_write, compat_ptr_ioctl).
 */
const struct file_operations ceph_file_fops = {
    .open = ceph_open,
    .release = ceph_release,
    .llseek = ceph_llseek,
    .read_iter = ceph_read_iter,
    .write_iter = ceph_write_iter,
    .mmap = ceph_mmap,
    .fsync = ceph_fsync,
    .lock = ceph_lock,
    .flock = ceph_flock,
    .splice_read = generic_file_splice_read,
    .splice_write = iter_file_splice_write,
    .unlocked_ioctl = ceph_ioctl,
    .compat_ioctl = compat_ptr_ioctl,
    .fallocate  = ceph_fallocate,
    .copy_file_range = ceph_copy_file_range,
};

/*
 * Take cap references to avoid releasing caps to MDS mid-write.
 *
 * If we are synchronous, and write with an old snap context, the OSD
 * may return EOLDSNAPC.  In that case, retry the write.. _after_
 * dropping our cap refs and allowing the pending snap to logically
 * complete _before_ this write occurs.
 *
 * If we are near ENOSPC, write synchronously.
 *
 * Returns `written` when it is nonzero (a byte count, or a negative error
 * produced by the write helpers / generic_write_sync), otherwise `err`
 * from the last pre-write check.
 */
static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    struct file *file = iocb->ki_filp;
    struct ceph_file_info *fi = file->private_data;
    struct inode *inode = file_inode(file);
    struct ceph_inode_info *ci = ceph_inode(inode);
    struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
    struct ceph_osd_client *osdc = &fsc->client->osdc;
    struct ceph_cap_flush *prealloc_cf;
    ssize_t count, written = 0;
    int err, want, got;
    bool direct_lock = false;
    u32 map_flags;
    u64 pool_flags;
    loff_t pos;
    /* Upper bound for the write offset: the larger of i_size and max_file_size. */
    loff_t limit = max(i_size_read(inode), fsc->max_file_size);

    /* Any inode whose snap id is not CEPH_NOSNAP is rejected as read-only. */
    if (ceph_snap(inode) != CEPH_NOSNAP)
        return -EROFS;

    /*
     * Allocated up front and passed by reference to
     * ceph_direct_read_write() / __ceph_mark_dirty_caps() below;
     * handed to ceph_free_cap_flush() at out_unlocked.
     */
    prealloc_cf = ceph_alloc_cap_flush();
    if (!prealloc_cf)
        return -ENOMEM;

    /* direct_lock is set only when IOCB_DIRECT is set and IOCB_APPEND is not. */
    if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT)
        direct_lock = true;

    /*
     * Re-entered via goto when the write reported -EOLDSNAPC.  Both write
     * branches below have already called ceph_end_io_*() by then, so the
     * start-io call is repeated here.
     */
retry_snap:
    if (direct_lock)
        ceph_start_io_direct(inode);
    else
        ceph_start_io_write(inode);

    /* We can write back this queue in page reclaim */
    current->backing_dev_info = inode_to_bdi(inode);

    /* For appends, call ceph_do_getattr() for CEPH_STAT_CAP_SIZE first. */
    if (iocb->ki_flags & IOCB_APPEND) {
        err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
        if (err < 0)
            goto out;
    }

    /* A result of zero or less ends the call here. */
    err = generic_write_checks(iocb, from);
    if (err <= 0)
        goto out;

    pos = iocb->ki_pos;
    if (unlikely(pos >= limit)) {
        err = -EFBIG;
        goto out;
    } else {
        /* Clamp the request so it does not extend past limit. */
        iov_iter_truncate(from, limit - pos);
    }

    count = iov_iter_count(from);
    if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
        err = -EDQUOT;
        goto out;
    }

    err = file_remove_privs(file);
    if (err)
        goto out;

    err = file_update_time(file);
    if (err)
        goto out;

    inode_inc_iversion_raw(inode);

    if (ci->i_inline_version != CEPH_INLINE_NONE) {
        err = ceph_uninline_data(file, NULL);
        if (err < 0)
            goto out;
    }

    /* Snapshot the osdmap and pool flags under osdc->lock. */
    down_read(&osdc->lock);
    map_flags = osdc->osdmap->flags;
    pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
    up_read(&osdc->lock);
    if ((map_flags & CEPH_OSDMAP_FULL) ||
        (pool_flags & CEPH_POOL_FLAG_FULL)) {
        err = -ENOSPC;
        goto out;
    }

    dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
         inode, ceph_vinop(inode), pos, count, i_size_read(inode));
    if (fi->fmode & CEPH_FILE_MODE_LAZY)
        want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
    else
        want = CEPH_CAP_FILE_BUFFER;
    got = 0;
    /* Need CEPH_CAP_FILE_WR; `want` is requested in addition, result in `got`. */
    err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
                &got, NULL);
    if (err < 0)
        goto out;

    dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
         inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));

    /*
     * Take the direct/sync path when neither BUFFER nor LAZYIO was
     * granted, the iocb is IOCB_DIRECT, the file has CEPH_F_SYNC, or the
     * inode carries CEPH_I_ERROR_WRITE.  Otherwise use
     * generic_perform_write().
     */
    if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
        (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
        (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
        struct ceph_snap_context *snapc;
        struct iov_iter data;

        /*
         * Pick the snap context under i_ceph_lock: the last entry of
         * i_cap_snaps if a cap snap is pending, else i_head_snapc.
         */
        spin_lock(&ci->i_ceph_lock);
        if (__ceph_have_pending_cap_snap(ci)) {
            struct ceph_cap_snap *capsnap =
                    list_last_entry(&ci->i_cap_snaps,
                            struct ceph_cap_snap,
                            ci_item);
            snapc = ceph_get_snap_context(capsnap->context);
        } else {
            BUG_ON(!ci->i_head_snapc);
            snapc = ceph_get_snap_context(ci->i_head_snapc);
        }
        spin_unlock(&ci->i_ceph_lock);

        /* we might need to revert back to that point */
        data = *from;
        if (iocb->ki_flags & IOCB_DIRECT)
            written = ceph_direct_read_write(iocb, &data, snapc,
                             &prealloc_cf);
        else
            written = ceph_sync_write(iocb, &data, pos, snapc);
        if (direct_lock)
            ceph_end_io_direct(inode);
        else
            ceph_end_io_write(inode);
        /* The write used the copy; advance the caller's iterator only on success. */
        if (written > 0)
            iov_iter_advance(from, written);
        ceph_put_snap_context(snapc);
    } else {
        /*
         * No need to acquire the i_truncate_mutex. Because
         * the MDS revokes Fwb caps before sending truncate
         * message to us. We can't get Fwb cap while there
         * are pending vmtruncate. So write and vmtruncate
         * can not run at the same time
         */
        written = generic_perform_write(file, from, pos);
        if (likely(written >= 0))
            iocb->ki_pos = pos + written;
        ceph_end_io_write(inode);
    }

    if (written >= 0) {
        int dirty;

        /* Clear the inline version and mark the FILE_WR cap dirty. */
        spin_lock(&ci->i_ceph_lock);
        ci->i_inline_version = CEPH_INLINE_NONE;
        dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
                           &prealloc_cf);
        spin_unlock(&ci->i_ceph_lock);
        if (dirty)
            __mark_inode_dirty(inode, dirty);
        if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
            ceph_check_caps(ci, 0, NULL);
    }

    dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
         inode, ceph_vinop(inode), pos, (unsigned)count,
         ceph_cap_string(got));
    /* Cap refs are dropped before any retry, as the header comment requires. */
    ceph_put_cap_refs(ci, got);

    if (written == -EOLDSNAPC) {
        /*
         * NOTE(review): the two adjacent string literals concatenate to
         * "...%ugot EOLDSNAPC..." with no separating space -- confirm
         * whether that is intended.
         */
        dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
             inode, ceph_vinop(inode), pos, (unsigned)count);
        goto retry_snap;
    }

    if (written >= 0) {
        /* Near-full osdmap or pool: set IOCB_DSYNC before generic_write_sync(). */
        if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
            (pool_flags & CEPH_POOL_FLAG_NEARFULL))
            iocb->ki_flags |= IOCB_DSYNC;
        written = generic_write_sync(iocb, written);
    }

    /* The end-io call was already made in the write branch; skip the one at out. */
    goto out_unlocked;
out:
    if (direct_lock)
        ceph_end_io_direct(inode);
    else
        ceph_end_io_write(inode);
out_unlocked:
    ceph_free_cap_flush(prealloc_cf);
    current->backing_dev_info = NULL;
    /*
     * NOTE(review): `written` is not reset at retry_snap, so if a check
     * fails with `goto out` on a retry pass this returns the stale
     * -EOLDSNAPC instead of `err` -- confirm whether that is intended.
     */
    return written ? written : err;
}

glibc:

/* Write NBYTES of BUF to FD.  Return the number written, or -1.

   This version never performs any output.  A zero-length request
   returns 0.  Every other request fails with -1 and errno set to
   EBADF (negative FD), EINVAL (null BUF), or ENOSYS otherwise.  */
ssize_t
__libc_write (int fd, const void *buf, size_t nbytes)
{
  int err;

  if (nbytes == 0)
    return 0;

  /* Choose the error code in the same priority order as the checks.  */
  if (fd < 0)
    err = EBADF;
  else if (buf == NULL)
    err = EINVAL;
  else
    err = ENOSYS;

  __set_errno (err);
  return -1;
}
/* Symbol bookkeeping for __libc_write, done through build macros whose
   definitions are not visible here.  NOTE(review): judging by the macro
   names, this flags write as a stub and publishes __libc_write under the
   additional weak names __write and write -- confirm against the macro
   definitions.  */
libc_hidden_def (__libc_write)
stub_warning (write)

weak_alias (__libc_write, __write)
libc_hidden_weak (__write)
weak_alias (__libc_write, write)

Однако ни в одном из этих фрагментов errno не устанавливается в EINTR.

1 Ответ

4 голосов
/ 23 апреля 2020

Это не настоящая реализация write, а заглушка для гипотетических целевых платформ, у которых на самом деле нет системного вызова write. Настоящая функция написана на ассемблере и генерируется из таблиц довольно запутанным процессом во время сборки glibc. Когда системный вызов возвращает признак ошибки (для большинства архитектур — значение >= -4095UL), управление передаётся функции __syscall_error, которая устанавливает errno, вместо непосредственного возврата.

...