[PATCH v5 16/31] powernv/fadump: process the crashdump by exporting it as /proc/vmcore

Thu Sep 5 07:01:46 AEST 2019

On 04/09/19 5:12 PM, Michael Ellerman wrote:
> Hari Bathini <hbathini at linux.ibm.com> writes:
>> diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
>> index a755705..10f6086 100644
>> --- a/arch/powerpc/platforms/powernv/opal-fadump.c
>> +++ b/arch/powerpc/platforms/powernv/opal-fadump.c
>> @@ -41,6 +43,37 @@ static void opal_fadump_update_config(struct fw_dump *fadump_conf,
>>  	fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
>>  }
>>  
>> +/*
>> + * This function is called in the capture kernel to get configuration details
>> + * from metadata setup by the first kernel.
>> + */
>> +static void opal_fadump_get_config(struct fw_dump *fadump_conf,
>> +				   const struct opal_fadump_mem_struct *fdm)
>> +{
>> +	int i;
>> +
>> +	if (!fadump_conf->dump_active)
>> +		return;
>> +
>> +	fadump_conf->boot_memory_size = 0;
>> +
>> +	pr_debug("Boot memory regions:\n");
>> +	for (i = 0; i < fdm->region_cnt; i++) {
>> +		pr_debug("\t%d. base: 0x%llx, size: 0x%llx\n",
>> +			 (i + 1), fdm->rgn[i].src, fdm->rgn[i].size);
> 
> Printing the zero-based array off by one (i + 1) seems confusing.

Hmmm... Numbering the regions from `0` in the debug output sounded inappropriate to me, hence the `(i + 1)`.

> 
>> +
>> +		fadump_conf->boot_memory_size += fdm->rgn[i].size;
>> +	}
>> +
>> +	/*
>> +	 * Start address of reserve dump area (permanent reservation) for
>> +	 * re-registering FADump after dump capture.
>> +	 */
>> +	fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
>> +
>> +	opal_fadump_update_config(fadump_conf, fdm);
>> +}
>> +
>>  /* Initialize kernel metadata */
>>  static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
>>  {
>> @@ -215,24 +248,114 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
>>  		pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
>>  }
>>  
>> +/*
>> + * Convert CPU state data saved at the time of crash into ELF notes.
>> + */
>> +static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
>> +{
>> +	u32 num_cpus, *note_buf;
>> +	struct fadump_crash_info_header *fdh = NULL;
>> +
>> +	num_cpus = 1;
>> +	/* Allocate buffer to hold cpu crash notes. */
>> +	fadump_conf->cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
>> +	fadump_conf->cpu_notes_buf_size =
>> +		PAGE_ALIGN(fadump_conf->cpu_notes_buf_size);
>> +	note_buf = fadump_cpu_notes_buf_alloc(fadump_conf->cpu_notes_buf_size);
>> +	if (!note_buf) {
>> +		pr_err("Failed to allocate 0x%lx bytes for cpu notes buffer\n",
>> +		       fadump_conf->cpu_notes_buf_size);
>> +		return -ENOMEM;
>> +	}
>> +	fadump_conf->cpu_notes_buf = __pa(note_buf);
>> +
>> +	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
>> +		 (num_cpus * sizeof(note_buf_t)), note_buf);
>> +
>> +	if (fadump_conf->fadumphdr_addr)
>> +		fdh = __va(fadump_conf->fadumphdr_addr);
>> +
>> +	if (fdh && (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)) {
>> +		note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
>> +		final_note(note_buf);
>> +
>> +		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
>> +			 fdh->elfcorehdr_addr);
>> +		fadump_update_elfcore_header(fadump_conf,
>> +					     __va(fdh->elfcorehdr_addr));
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int __init opal_fadump_process(struct fw_dump *fadump_conf)
>>  {
>> -	return -EINVAL;
>> +	struct fadump_crash_info_header *fdh;
>> +	int rc = 0;
>
> No need to initialise rc there.
> 

	rc = -EINVAL;

and

>> +	if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
>> +		return -EINVAL;

>> +
>> +	/* Validate the fadump crash info header */
>> +	fdh = __va(fadump_conf->fadumphdr_addr);
>> +	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
>> +		pr_err("Crash info header is not valid.\n");
>> +		return -EINVAL;

	return rc; ??

>> +	}
>> +
>> +	/*
>> +	 * TODO: To build cpu notes, find a way to map PIR to logical id.
>> +	 *       Also, we may need different method for pseries and powernv.
>> +	 *       The currently booted kernel could have a different PIR to
>> +	 *       logical id mapping. So, try saving info of previous kernel's
>> +	 *       paca to get the right PIR to logical id mapping.
>> +	 */
> 
> That TODO is removed by the end of the series, so please just omit it entirely.
> 
>> +	rc = opal_fadump_build_cpu_notes(fadump_conf);
>> +	if (rc)
>> +		return rc;
> 
> I think this all runs early in boot, so we don't need to worry about
> another CPU seeing the partially initialised core due to there being no
> barrier here before we set elfcorehdr_addr?
> 

This is processed in fs/proc/vmcore.c during fs_initcall(), and the data within the core
is only processed much later (from the initrd). So, I think we are good here.

>> +	/*
>> +	 * We are done validating dump info and elfcore header is now ready
>> +	 * to be exported. set elfcorehdr_addr so that vmcore module will
>> +	 * export the elfcore header through '/proc/vmcore'.
>> +	 */
>> +	elfcorehdr_addr = fdh->elfcorehdr_addr;
> 
>> @@ -283,5 +407,42 @@ int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
>>  	fadump_conf->ops		= &opal_fadump_ops;
>>  	fadump_conf->fadump_supported	= 1;
>>  
>> +	/*
>> +	 * Check if dump has been initiated on last reboot.
>> +	 */
>> +	prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
>> +	if (prop) {
> 
>         if (!prop)
>                 return 1;
> 
> And then everything below can be unindented.
> 
>> +		u64 addr = 0;
>> +		s64 ret;
>> +		const struct opal_fadump_mem_struct *r_opal_fdm_active;
> 
>   *
>  / \
>  /_\
>   |
> 

:) Will take care of such instances...
I think a check for this declaration ordering should be added to checkpatch.pl.

>> +
>> +		ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
>> +		if ((ret != OPAL_SUCCESS) || !addr) {
>> +			pr_err("Failed to get Kernel metadata (%lld)\n", ret);
>> +			return 1;
>> +		}
>> +
>> +		addr = be64_to_cpu(addr);
>> +		pr_debug("Kernel metadata addr: %llx\n", addr);
>> +
>> +		opal_fdm_active = __va(addr);
>> +		r_opal_fdm_active = (void *)addr;
> 
> Why do we need the r_ version?
> 
> We're called early in boot, so we are still in real mode, but that's
> fine the CPU will ignore the top bits of the virtual address for us.

I don't know if I am missing a trick here or if there is a bug somewhere,
but accessing `opal_fdm_active->version` is not working for me.

- Hari